From 210b49cac9b08b11981444141c3d12318231578c Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Tue, 30 Apr 2019 08:09:13 -0700
Subject: [PATCH 001/572] Disable pipelined write in atomic flush stress test (#5266)

Summary:
Pipelined write currently allows one thread to perform memtable writes while another thread is traversing the `flush_scheduler_`, which can cause an assertion failure in `FlushScheduler::Clear`. To unblock crash recovery tests, we temporarily disable pipelined write when atomic flush is enabled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5266
Differential Revision: D15142285
Pulled By: riversand963
fbshipit-source-id: a0c20fe4ac543e08feaed602414f982054df7831
---
 tools/db_crashtest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index a1a9ecb66ea..a27abe8cf50 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -136,6 +136,8 @@ def is_direct_io_supported(dbname):
     # use small value for write_buffer_size so that RocksDB triggers flush
     # more frequently
     "write_buffer_size": 1024 * 1024,
+    # disable pipelined write when test_atomic_flush is true
+    "enable_pipelined_write": 0,
 }

From 25810ca9c7158ec71ec27f8dd98b4d61ff88fc66 Mon Sep 17 00:00:00 2001
From: bxq2011hust
Date: Tue, 30 Apr 2019 09:30:46 -0700
Subject: [PATCH 002/572] compile gtest only when tests are enabled

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5248
Differential Revision: D15149190
Pulled By: maysamyabandeh
fbshipit-source-id: fd6d799e80bb502a7ddbc07032ea87e2e3f1e24f
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb8067d2245..f4feee986c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -461,8 +461,6 @@ include_directories(${PROJECT_SOURCE_DIR}/include)
 include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src)
 find_package(Threads REQUIRED)
 
-add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
-
 # Main library source code
 
 set(SOURCES
@@ -843,6 +841,7 @@ endif()
 
 option(WITH_TESTS "build with tests" ON)
 if(WITH_TESTS)
+  add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
   set(TESTS
     cache/cache_test.cc
     cache/lru_cache_test.cc

From b02d0c238db9278cd45375cb10e32161244fd3c9 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Tue, 30 Apr 2019 09:46:40 -0700
Subject: [PATCH 003/572] Init compression dict handle before reading meta-blocks (#5267)

Summary:
At least one of the meta-block loading functions (`ReadRangeDelBlock`) uses the same block reading function (`NewDataBlockIterator`) as data block reads, which means it uses the dictionary handle. However, the dictionary handle was uninitialized while reading meta-blocks, causing readers to receive an error. This situation was only noticed when `cache_index_and_filter_blocks=true`.

This PR initializes the handle to null while reading meta-blocks to prevent the error. It also adds support to `db_stress` / `db_crashtest.py` for `cache_index_and_filter_blocks`.

Fixes #5263.
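For readers who want to reproduce the original failure, here is a minimal configuration sketch that exercises the affected path. It is hypothetical (the path, cache size, and keys are ours); the essential ingredient, per the report, is `cache_index_and_filter_blocks=true` together with a block cache.

```cpp
#include <cassert>
#include <string>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = rocksdb::NewLRUCache(1 << 20);
  // The configuration under which the uninitialized dictionary handle
  // was observed while loading meta-blocks.
  table_options.cache_index_and_filter_blocks = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/repro_5263", &db);
  assert(s.ok());

  // Flush creates an SST file; the subsequent read forces its meta-blocks
  // to be loaded, which is where the error surfaced before this fix.
  db->Put(rocksdb::WriteOptions(), "key", "value");
  db->Flush(rocksdb::FlushOptions());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);

  delete db;
  return s.ok() ? 0 : 1;
}
```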
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5267
Differential Revision: D15149264
Pulled By: maysamyabandeh
fbshipit-source-id: 991d38a306c62db5976778bfb050fa3cd4a0671b
---
 table/block_based_table_reader.cc | 5 +++++
 tools/db_crashtest.py             | 1 +
 tools/db_stress.cc                | 5 +++++
 3 files changed, 11 insertions(+)

diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index ad088337a19..d6c9ab88796 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -838,6 +838,11 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                         rep->persistent_cache_key_prefix_size),
                     rep->ioptions.statistics);
 
+  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
+  // handle to null, otherwise it may be seen as uninitialized during the below
+  // meta-block reads.
+  rep->compression_dict_handle = BlockHandle::NullBlockHandle();
+
   // Read metaindex
   std::unique_ptr<Block> meta;
   std::unique_ptr<InternalIterator> meta_iter;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index a27abe8cf50..6c7fbabbf11 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -25,6 +25,7 @@
 default_params = {
     "acquire_snapshot_one_in": 10000,
     "block_size": 16384,
+    "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
     "cache_size": 1048576,
     "checkpoint_one_in": 1000000,
     "compression_type": "snappy",
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 7f8c4b53f7b..2ecd2aa6d13 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -297,6 +297,9 @@ DEFINE_int32(set_in_place_one_in, 0,
 DEFINE_int64(cache_size, 2LL * KB * KB * KB,
              "Number of bytes to use as a cache of uncompressed data.");
 
+DEFINE_bool(cache_index_and_filter_blocks, false,
+            "True if indexes/filters should be cached in block cache.");
+
 DEFINE_bool(use_clock_cache, false,
             "Replace default LRU block cache with clock cache.");
 
@@ -2578,6 +2581,8 @@ class StressTest {
     if (FLAGS_options_file.empty()) {
       BlockBasedTableOptions block_based_options;
       block_based_options.block_cache = cache_;
+      block_based_options.cache_index_and_filter_blocks =
+          FLAGS_cache_index_and_filter_blocks;
       block_based_options.block_cache_compressed = compressed_cache_;
       block_based_options.checksum = FLAGS_checksum_type_e;
       block_based_options.block_size = FLAGS_block_size;

From a5debd7ed821489c5f9e87c805fdd5bc30a85388 Mon Sep 17 00:00:00 2001
From: David Palm
Date: Tue, 30 Apr 2019 10:08:13 -0700
Subject: [PATCH 004/572] Add rocksdb_property_int_cf (#5268)

Summary:
Adds the missing `rocksdb_property_int_cf` function to the C API to let consuming libraries avoid parsing strings.
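A hedged usage sketch of the new function follows; the property name and surrounding setup are illustrative assumptions, while the function itself is the one added in the diff below.

```cpp
#include <stdint.h>
#include <stdio.h>

#include "rocksdb/c.h"

// Assumes `db` and `cf` were obtained earlier, e.g. via
// rocksdb_open_column_families(). "rocksdb.estimate-num-keys" is one of
// the standard integer-valued properties.
void print_estimated_num_keys(rocksdb_t* db,
                              rocksdb_column_family_handle_t* cf) {
  uint64_t num_keys = 0;
  if (rocksdb_property_int_cf(db, cf, "rocksdb.estimate-num-keys",
                              &num_keys) == 0) {
    printf("estimated keys: %llu\n", (unsigned long long)num_keys);
  } else {
    printf("not an integer property, or unknown property name\n");
  }
}
```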
Fixes https://github.com/facebook/rocksdb/issues/5249

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5268
Differential Revision: D15149461
Pulled By: maysamyabandeh
fbshipit-source-id: e9fe5f1ad7c64066d921dba8473507269b51d331
---
 db/c.cc             | 12 ++++++++++++
 include/rocksdb/c.h |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 743a88d838e..aac1cf4087c 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -1070,6 +1070,18 @@ int rocksdb_property_int(
   }
 }
 
+int rocksdb_property_int_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* propname,
+    uint64_t *out_val) {
+  if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
 char* rocksdb_property_value_cf(
     rocksdb_t* db,
     rocksdb_column_family_handle_t* column_family,
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ceb99ebf945..a0ae7ca7785 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -336,6 +336,11 @@ int rocksdb_property_int(
     rocksdb_t* db, const char* propname,
     uint64_t *out_val);
 
+/* returns 0 on success, -1 otherwise */
+int rocksdb_property_int_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* propname, uint64_t *out_val);
+
 extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
     rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
     const char* propname);

From 03c7ae24c20d0123ef3e45077fd683946ff3384d Mon Sep 17 00:00:00 2001
From: Yuqi Gu
Date: Tue, 30 Apr 2019 10:56:06 -0700
Subject: [PATCH 005/572] RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221)

Summary:
1. Add an Arm linear crc32c implementation for RocksDB.
2. Add an Arm runtime check for crc32 support.
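Before the diff, here is a standalone sketch of the runtime-detection idiom the patch relies on. The function name is ours; the patch wires the same check into `Choose_Extend` via `crc32c_runtime_check`.

```cpp
#include <cstdio>

#if defined(__linux__) && defined(__aarch64__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)  // fallback for old kernel headers, as in the patch
#endif
// The kernel reports CPU features through the ELF auxiliary vector;
// the HWCAP_CRC32 bit means the CRC32 instructions are available.
static bool HasHardwareCrc32() {
  return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
}
#else
static bool HasHardwareCrc32() { return false; }
#endif

int main() {
  std::printf("hardware crc32c: %s\n", HasHardwareCrc32() ? "yes" : "no");
  return 0;
}
```

Detection happens once; `Choose_Extend()` then binds a function pointer to the fastest available implementation, so per-call overhead is unchanged.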
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221
Differential Revision: D15013685
Pulled By: siying
fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12
---
 Makefile             |  6 +++++
 src.mk               |  5 ++++
 util/crc32c.cc       | 27 ++++++++++++++++++---
 util/crc32c_arm64.cc | 56 ++++++++++++++++++++++++++++++++++++++++++++
 util/crc32c_arm64.h  | 21 +++++++++++++++++
 5 files changed, 112 insertions(+), 3 deletions(-)
 create mode 100644 util/crc32c_arm64.cc
 create mode 100644 util/crc32c_arm64.h

diff --git a/Makefile b/Makefile
index eee0f9fba02..928046f0050 100644
--- a/Makefile
+++ b/Makefile
@@ -137,6 +137,12 @@ CFLAGS += -DHAVE_POWER8
 HAVE_POWER8=1
 endif
 
+ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
+CXXFLAGS += -march=armv8-a+crc
+CFLAGS += -march=armv8-a+crc
+ARMCRC_SOURCE=1
+endif
+
 # if we're compiling for release, compile without debug code (-DNDEBUG)
 ifeq ($(DEBUG_LEVEL),0)
 OPT += -DNDEBUG
diff --git a/src.mk b/src.mk
index 55b4e3427c6..e3fe5632f87 100644
--- a/src.mk
+++ b/src.mk
@@ -216,6 +216,11 @@ LIB_SOURCES = \
   utilities/write_batch_with_index/write_batch_with_index.cc \
   utilities/write_batch_with_index/write_batch_with_index_internal.cc \
 
+ifeq ($(ARMCRC_SOURCE),1)
+LIB_SOURCES +=\
+  util/crc32c_arm64.cc
+endif
+
 ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
 LIB_SOURCES_ASM =\
   util/crc32c_ppc_asm.S
diff --git a/util/crc32c.cc b/util/crc32c.cc
index 9e4b65e66e1..e8d4116ff42 100644
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
@@ -18,6 +18,8 @@
 #include "util/coding.h"
 #include "util/util.h"
 
+#include "util/crc32c_arm64.h"
+
 #ifdef __powerpc64__
 #include "util/crc32c_ppc.h"
 #include "util/crc32c_ppc_constants.h"
@@ -463,6 +465,11 @@ static bool isAltiVec() {
 }
 #endif
 
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
+  return crc32c_arm64(crc, (const unsigned char *)buf, size);
+}
+#endif
 
 std::string IsFastCrc32Supported() {
   bool has_fast_crc = false;
@@ -478,6 +485,14 @@ std::string IsFastCrc32Supported() {
     has_fast_crc = false;
     arch = "PPC";
 #endif
+#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
+  if (crc32c_runtime_check()) {
+    has_fast_crc = true;
+    arch = "Arm64";
+  } else {
+    has_fast_crc = false;
+    arch = "Arm64";
+  }
 #else
   has_fast_crc = isSSE42();
   arch = "x86";
@@ -1200,7 +1215,15 @@ uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
 #endif //HAVE_SSE42 && HAVE_PCLMUL
 
 static inline Function Choose_Extend() {
-#ifndef HAVE_POWER8
+#ifdef HAVE_POWER8
+  return isAltiVec() ? ExtendPPCImpl : ExtendImpl;
+#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
+  if(crc32c_runtime_check()) {
+    return ExtendARMImpl;
+  } else {
+    return ExtendImpl;
+  }
+#else
   if (isSSE42()) {
     if (isPCLMULQDQ()) {
 #if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C
@@ -1216,8 +1239,6 @@ static inline Function Choose_Extend() {
   else {
     return ExtendImpl;
   }
-#else  //HAVE_POWER8
-  return isAltiVec() ? ExtendPPCImpl : ExtendImpl;
 #endif
 }

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
new file mode 100644
index 00000000000..62fabe99e3c
--- /dev/null
+++ b/util/crc32c_arm64.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/crc32c_arm64.h"
+
+#if defined(__linux__) && defined(HAVE_ARM64_CRC)
+
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+uint32_t crc32c_runtime_check(void) {
+  uint64_t auxv = getauxval(AT_HWCAP);
+  return (auxv & HWCAP_CRC32) != 0;
+}
+
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
+                      unsigned len) {
+  const uint8_t *buf1;
+  const uint16_t *buf2;
+  const uint32_t *buf4;
+  const uint64_t *buf8;
+
+  int64_t length = (int64_t)len;
+
+  crc ^= 0xffffffff;
+  buf8 = (const uint64_t *)data;
+  while ((length -= sizeof(uint64_t)) >= 0) {
+    crc = __crc32cd(crc, *buf8++);
+  }
+
+  /* The following is more efficient than the straight loop */
+  buf4 = (const uint32_t *)buf8;
+  if (length & sizeof(uint32_t)) {
+    crc = __crc32cw(crc, *buf4++);
+    length -= 4;
+  }
+
+  buf2 = (const uint16_t *)buf4;
+  if (length & sizeof(uint16_t)) {
+    crc = __crc32ch(crc, *buf2++);
+    length -= 2;
+  }
+
+  buf1 = (const uint8_t *)buf2;
+  if (length & sizeof(uint8_t))
+    crc = __crc32cb(crc, *buf1);
+
+  crc ^= 0xffffffff;
+  return crc;
+}
+
+#endif
diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
new file mode 100644
index 00000000000..0e77ecd0ef5
--- /dev/null
+++ b/util/crc32c_arm64.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+ +#ifndef UTIL_CRC32C_ARM64_H +#define UTIL_CRC32C_ARM64_H + +#include + +#if defined(__aarch64__) || defined(__AARCH64__) +#ifdef __ARM_FEATURE_CRC32 +#define HAVE_ARM64_CRC +#include +extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len); +extern uint32_t crc32c_runtime_check(void); +#endif +#endif + + +#endif From 36ea379cdc542c81af9d708f04151f8228b0425e Mon Sep 17 00:00:00 2001 From: Fosco Marotto Date: Tue, 30 Apr 2019 15:05:25 -0700 Subject: [PATCH 006/572] Update history and version for future 6.2.0 (#5270) Summary: Update history before branch cut. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5270 Differential Revision: D15153700 Pulled By: gfosco fbshipit-source-id: 2c81e01a2ab965661b1d88209dca74ba0a3756cb --- HISTORY.md | 2 ++ include/rocksdb/version.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 66dd73965ec..4b08ce9d170 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased + +## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 2e8b496819c..7b7d7e86224 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 1 -#define ROCKSDB_PATCH 1 +#define ROCKSDB_MINOR 2 +#define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 521d234bdabb00bdaf60ebb207f67256deec648d Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 1 May 2019 10:04:21 -0700 Subject: [PATCH 007/572] Revert snap_refresh_nanos feature (#5269) Summary: Our daily stress tests are failing after this feature. Reverting temporarily until we figure the reason for test failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5269 Differential Revision: D15151285 Pulled By: maysamyabandeh fbshipit-source-id: e4002b99690a97df30d4b4b58bf0f61e9591bc6e --- HISTORY.md | 1 - db/c.cc | 5 -- db/compaction_iterator.cc | 60 +++++-------- db/compaction_iterator.h | 60 +------------ db/compaction_job.cc | 5 +- db/compaction_job.h | 31 +++---- db/compaction_job_test.cc | 141 +++--------------------------- db/db_impl.h | 9 +- db/db_impl_compaction_flush.cc | 37 +------- db/snapshot_impl.h | 14 +-- include/rocksdb/c.h | 2 - include/rocksdb/options.h | 11 --- options/cf_options.cc | 2 - options/cf_options.h | 3 - options/options.cc | 4 - options/options_helper.cc | 25 +++--- options/options_settable_test.cc | 1 - options/options_test.cc | 2 - table/mock_table.cc | 14 --- table/mock_table.h | 8 -- util/compaction_job_stats_impl.cc | 3 - 21 files changed, 70 insertions(+), 368 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 4b08ce9d170..2d3fd87c88c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,7 +8,6 @@ * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. * Block-based table index now contains exact highest key in the file, rather than an upper bound. 
 * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior.
 * When reading from option file/string/map, customized envs can be filled according to object registry.
-* Add an option `snap_refresh_nanos` (default to 0.5s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
 * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator.
 
 ### Public API Change
diff --git a/db/c.cc b/db/c.cc
index aac1cf4087c..9f5995a413b 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -2226,11 +2226,6 @@ void rocksdb_options_set_max_bytes_for_level_base(
   opt->rep.max_bytes_for_level_base = n;
 }
 
-void rocksdb_options_set_snap_refresh_nanos(rocksdb_options_t* opt,
-                                            uint64_t n) {
-  opt->rep.snap_refresh_nanos = n;
-}
-
 void rocksdb_options_set_level_compaction_dynamic_level_bytes(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.level_compaction_dynamic_level_bytes = v;
diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index bce0b82dbc7..93c2b5fa9e9 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -38,16 +38,14 @@ CompactionIterator::CompactionIterator(
     CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
     const CompactionFilter* compaction_filter,
     const std::atomic<bool>* shutting_down,
-    const SequenceNumber preserve_deletes_seqnum,
-    SnapshotListFetchCallback* snap_list_callback)
+    const SequenceNumber preserve_deletes_seqnum)
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots,
          earliest_write_conflict_snapshot, snapshot_checker, env,
          report_detailed_time, expect_valid_internal_key, range_del_agg,
          std::unique_ptr<CompactionProxy>(compaction ?
new CompactionProxy(compaction) : nullptr), - compaction_filter, shutting_down, preserve_deletes_seqnum, - snap_list_callback) {} + compaction_filter, shutting_down, preserve_deletes_seqnum) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -59,8 +57,7 @@ CompactionIterator::CompactionIterator( std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, - SnapshotListFetchCallback* snap_list_callback) + const SequenceNumber preserve_deletes_seqnum) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -78,8 +75,7 @@ CompactionIterator::CompactionIterator( current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), - current_key_committed_(false), - snap_list_callback_(snap_list_callback) { + current_key_committed_(false) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); assert(snapshots_ != nullptr); bottommost_level_ = @@ -87,7 +83,24 @@ CompactionIterator::CompactionIterator( if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } - ProcessSnapshotList(); + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif input_->SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -209,28 +222,6 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } } -void CompactionIterator::ProcessSnapshotList() { -#ifndef NDEBUG - // findEarliestVisibleSnapshot assumes this ordering. - for (size_t i = 1; i < snapshots_->size(); ++i) { - assert(snapshots_->at(i - 1) < snapshots_->at(i)); - } -#endif - if (snapshots_->size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip_ = true; - earliest_snapshot_iter_ = snapshots_->end(); - earliest_snapshot_ = kMaxSequenceNumber; - latest_snapshot_ = 0; - } else { - visible_at_tip_ = false; - earliest_snapshot_iter_ = snapshots_->begin(); - earliest_snapshot_ = snapshots_->at(0); - latest_snapshot_ = snapshots_->back(); - } - released_snapshots_.clear(); -} - void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; @@ -278,13 +269,6 @@ void CompactionIterator::NextFromInput() { // compaction filter). ikey_.user_key is pointing to the copy. 
if (!has_current_user_key_ || !cmp_->Equal(ikey_.user_key, current_user_key_)) { - num_keys_++; - // Use num_keys_ to reduce the overhead of reading current time - if (snap_list_callback_ && snapshots_->size() && - snap_list_callback_->TimeToRefresh(num_keys_)) { - snap_list_callback_->Refresh(snapshots_, latest_snapshot_); - ProcessSnapshotList(); - } // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index 6ab43b1becf..a9e7a262071 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -21,53 +21,6 @@ namespace rocksdb { -// This callback can be used to refresh the snapshot list from the db. It -// includes logics to exponentially decrease the refresh rate to limit the -// overhead of refresh. -class SnapshotListFetchCallback { - public: - SnapshotListFetchCallback(Env* env, uint64_t snap_refresh_nanos, - size_t every_nth_key = 1024) - : timer_(env, /*auto restart*/ true), - snap_refresh_nanos_(snap_refresh_nanos), - every_nth_key_minus_one_(every_nth_key - 1) { - assert(every_nth_key > 0); - assert((ceil(log2(every_nth_key)) == floor(log2(every_nth_key)))); - } - // Refresh the snapshot list. snapshots will bre replacted with the new list. - // max is the upper bound. Note: this function will acquire the db_mutex_. - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) = 0; - inline bool TimeToRefresh(const size_t key_index) { - // skip the key if key_index % every_nth_key (which is of power 2) is not 0. - if ((key_index & every_nth_key_minus_one_) != 0) { - return false; - } - const uint64_t elapsed = timer_.ElapsedNanos(); - auto ret = elapsed > snap_refresh_nanos_; - // pre-compute the next time threshold - if (ret) { - // inc next refresh period exponentially (by x4) - auto next_refresh_threshold = snap_refresh_nanos_ << 2; - // make sure the shift has not overflown the highest 1 bit - snap_refresh_nanos_ = - std::max(snap_refresh_nanos_, next_refresh_threshold); - } - return ret; - } - static constexpr SnapshotListFetchCallback* kDisabled = nullptr; - - virtual ~SnapshotListFetchCallback() {} - - private: - // Time since the callback was created - StopWatchNano timer_; - // The delay before calling ::Refresh. To be increased exponentially. - uint64_t snap_refresh_nanos_; - // Skip evey nth key. Number n if of power 2. The math will require n-1. - const uint64_t every_nth_key_minus_one_; -}; - class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what @@ -116,8 +69,7 @@ class CompactionIterator { const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + const SequenceNumber preserve_deletes_seqnum = 0); // Constructor with custom CompactionProxy, used for tests. 
CompactionIterator(InternalIterator* input, const Comparator* cmp, @@ -130,8 +82,7 @@ class CompactionIterator { std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + const SequenceNumber preserve_deletes_seqnum = 0); ~CompactionIterator(); @@ -159,8 +110,6 @@ class CompactionIterator { private: // Processes the input stream to find the next output void NextFromInput(); - // Process snapshots_ and assign related variables - void ProcessSnapshotList(); // Do last preparations before presenting the output to the callee. At this // point this only zeroes out the sequence number if possible for better @@ -195,7 +144,7 @@ class CompactionIterator { InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; - std::vector* snapshots_; + const std::vector* snapshots_; // List of snapshots released during compaction. // findEarliestVisibleSnapshot() find them out from return of // snapshot_checker, and make sure they will not be returned as @@ -270,9 +219,6 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - SnapshotListFetchCallback* snap_list_callback_; - // number of distinct keys processed - size_t num_keys_ = 0; bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index bc127a4c45c..45221a15512 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -315,7 +315,7 @@ CompactionJob::CompactionJob( const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback) + Env::Priority thread_pri) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), @@ -336,7 +336,6 @@ CompactionJob::CompactionJob( db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), - snap_list_callback_(snap_list_callback), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), @@ -893,7 +892,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_, snap_list_callback_)); + shutting_down_, preserve_deletes_seqnum_)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { diff --git a/db/compaction_job.h b/db/compaction_job.h index b3a0f2eb4b5..9767985f336 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -57,20 +57,22 @@ class VersionSet; class CompactionJob { public: - CompactionJob( - int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const EnvOptions env_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - 
Directory* db_directory, Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback); + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri); ~CompactionJob(); @@ -150,7 +152,6 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; - SnapshotListFetchCallback* snap_list_callback_; // This is the earliest snapshot that could be used for write-conflict // checking by a transaction. For any user-key newer than this snapshot, we diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 60394cc9735..f05a8ec2ff7 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -5,13 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include -#include #include #include #include @@ -200,13 +194,6 @@ class CompactionJobTest : public testing::Test { } void NewDB() { - DestroyDB(dbname_, Options()); - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_)); - compaction_job_stats_.Reset(); - VersionEdit new_db; new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -243,10 +230,7 @@ class CompactionJobTest : public testing::Test { const std::vector>& input_files, const stl_wrappers::KVMap& expected_results, const std::vector& snapshots = {}, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, - int output_level = 1, bool verify = true, - SnapshotListFetchCallback* snapshot_fetcher = - SnapshotListFetchCallback::kDisabled) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -263,7 +247,7 @@ class CompactionJobTest : public testing::Test { Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - compaction_input_files, output_level, 1024 * 1024, + compaction_input_files, 1, 1024 * 1024, 10 * 1024 * 1024, 0, kNoCompression, cfd->ioptions()->compression_opts, 0, {}, true); compaction.SetInputVersion(cfd->current()); @@ -279,7 +263,7 @@ class CompactionJobTest : public testing::Test { nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, 
table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER, snapshot_fetcher); + Env::Priority::USER); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -291,17 +275,15 @@ class CompactionJobTest : public testing::Test { ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); mutex_.Unlock(); - if (verify) { - if (expected_results.size() == 0) { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); - } else { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); - mock_table_factory_->AssertLatestFile(expected_results); - } + if (expected_results.size() == 0) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + } else { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); + mock_table_factory_->AssertLatestFile(expected_results); } } @@ -956,105 +938,6 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { RunCompaction({files}, expected_results); } -// Test the snapshot fetcher in compaction -TEST_F(CompactionJobTest, SnapshotRefresh) { - uint64_t time_seed = env_->NowMicros(); - printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce - Random64 rand(time_seed); - std::vector db_snapshots; - class SnapshotListFetchCallbackTest : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackTest(Env* env, Random64& rand, - std::vector* snapshots) - : SnapshotListFetchCallback(env, 0 /*no time delay*/, - 1 /*fetch after each key*/), - rand_(rand), - snapshots_(snapshots) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber) override { - assert(snapshots->size()); - assert(snapshots_->size()); - assert(snapshots_->size() == snapshots->size()); - if (rand_.OneIn(2)) { - uint64_t release_index = rand_.Uniform(snapshots_->size()); - snapshots_->erase(snapshots_->begin() + release_index); - *snapshots = *snapshots_; - } - } - - private: - Random64 rand_; - std::vector* snapshots_; - } snapshot_fetcher(env_, rand, &db_snapshots); - - std::vector> file1_kvs, file2_kvs; - std::array types = {kTypeValue, kTypeDeletion, - kTypeSingleDeletion}; - SequenceNumber last_seq = 0; - for (int i = 1; i < 100; i++) { - SequenceNumber seq = last_seq + 1; - last_seq = seq; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file1_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file1 = mock::MakeMockFile(file1_kvs); - for (int i = 1; i < 100; i++) { - SequenceNumber seq = last_seq + 1; - last_seq++; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file2_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file2 = mock::MakeMockFile(file2_kvs); - for (SequenceNumber i = 1; i < last_seq + 1; i++) { - if (rand.OneIn(5)) { - db_snapshots.push_back(i); - } - } - - const bool kVerify = true; - const int output_level_0 = 0; - NewDB(); - AddMockFile(file1); - AddMockFile(file2); - SetLastSequence(last_seq); - auto files = 
cfd_->current()->storage_info()->LevelFiles(0); - // put the output on L0 since it is easier to feed them again to the 2nd - // compaction - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify, &snapshot_fetcher); - - // Now db_snapshots are changed. Run the compaction again without snapshot - // fetcher but with the updated snapshot list. - compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); - // The result should be what we get if we run compaction without snapshot - // fetcher on the updated list of snapshots - auto expected = mock_table_factory_->output(); - - NewDB(); - AddMockFile(file1); - AddMockFile(file2); - SetLastSequence(last_seq); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify); - // The 2nd compaction above would get rid of useless delete markers. To get - // the output here exactly as what we got above after two compactions, we also - // run the compaction for 2nd time. - compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl.h b/db/db_impl.h index 623f69ba6ef..9bdb0abdc10 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -561,13 +561,6 @@ class DBImpl : public DB { const SnapshotList& snapshots() const { return snapshots_; } - void LoadSnapshots(std::vector* snap_vector, - SequenceNumber* oldest_write_conflict_snapshot, - const SequenceNumber& max_seq) const { - InstrumentedMutexLock l(mutex()); - snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq); - } - const ImmutableDBOptions& immutable_db_options() const { return immutable_db_options_; } @@ -746,7 +739,7 @@ class DBImpl : public DB { // Not thread-safe. 
void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); - InstrumentedMutex* mutex() const { return &mutex_; } + InstrumentedMutex* mutex() { return &mutex_; } Status NewDB(); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index f16c6111752..49b6c0fd804 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -798,29 +798,6 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } -class SnapshotListFetchCallbackImpl : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackImpl(DBImpl* db_impl, Env* env, - uint64_t snap_refresh_nanos, Logger* info_log) - : SnapshotListFetchCallback(env, snap_refresh_nanos), - db_impl_(db_impl), - info_log_(info_log) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) override { - size_t prev = snapshots->size(); - snapshots->clear(); - db_impl_->LoadSnapshots(snapshots, nullptr, max); - size_t now = snapshots->size(); - ROCKS_LOG_DEBUG(info_log_, - "Compaction snapshot count refreshed from %zu to %zu", prev, - now); - } - - private: - DBImpl* db_impl_; - Logger* info_log_; -}; - Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -992,9 +969,6 @@ Status DBImpl::CompactFilesImpl( assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -1004,9 +978,7 @@ Status DBImpl::CompactFilesImpl( snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, - immutable_db_options_.max_subcompactions <= 1 ? &fetch_callback - : nullptr); + &compaction_job_stats, Env::Priority::USER); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -2650,9 +2622,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -2662,9 +2631,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, thread_pri, - immutable_db_options_.max_subcompactions <= 1 ? 
&fetch_callback - : nullptr); + &compaction_job_stats, thread_pri); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index f1cf6f4b755..f2610fd18b2 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -91,23 +91,13 @@ class SnapshotList { SequenceNumber* oldest_write_conflict_snapshot = nullptr, const SequenceNumber& max_seq = kMaxSequenceNumber) const { std::vector ret; - GetAll(&ret, oldest_write_conflict_snapshot, max_seq); - return ret; - } - - void GetAll(std::vector* snap_vector, - SequenceNumber* oldest_write_conflict_snapshot = nullptr, - const SequenceNumber& max_seq = kMaxSequenceNumber) const { - std::vector& ret = *snap_vector; - // So far we have no use case that would pass a non-empty vector - assert(ret.size() == 0); if (oldest_write_conflict_snapshot != nullptr) { *oldest_write_conflict_snapshot = kMaxSequenceNumber; } if (empty()) { - return; + return ret; } const SnapshotImpl* s = &list_; while (s->next_ != &list_) { @@ -129,7 +119,7 @@ class SnapshotList { s = s->next_; } - return; + return ret; } // get the sequence number of the most recent snapshot diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a0ae7ca7785..4b34996a730 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -816,8 +816,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); -extern ROCKSDB_LIBRARY_API void rocksdb_options_set_snap_refresh_nanos( - rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ab856bee8e1..4cc2998b2d8 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -269,17 +269,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; - // If non-zero, compactions will periodically refresh the snapshot list. The - // delay for the first refresh is snap_refresh_nanos nano seconds and - // exponentially increases afterwards. When having many short-lived snapshots, - // this option helps reducing the cpu usage of long-running compactions. The - // feature is disabled when max_subcompactions is greater than one. - // - // Default: 0.5s - // - // Dynamically changeable through SetOptions() API - uint64_t snap_refresh_nanos = 500 * 1000 * 1000; // 0.5s - // Disable automatic compactions. 
Manual compactions can still // be issued on this column family // diff --git a/options/cf_options.cc b/options/cf_options.cc index f7af3f834c9..78accaeb915 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -169,8 +169,6 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_multiplier); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_INFO(log, " snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); ROCKS_LOG_INFO(log, " ttl: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index 47fca58fa7d..d0c4390c36d 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -149,7 +149,6 @@ struct MutableCFOptions { target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), - snap_refresh_nanos(options.snap_refresh_nanos), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), @@ -186,7 +185,6 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), - snap_refresh_nanos(0), max_bytes_for_level_multiplier(0), ttl(0), periodic_compaction_seconds(0), @@ -238,7 +236,6 @@ struct MutableCFOptions { uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; - uint64_t snap_refresh_nanos; double max_bytes_for_level_multiplier; uint64_t ttl; uint64_t periodic_compaction_seconds; diff --git a/options/options.cc b/options/options.cc index 900510d01b6..bfe3e313d30 100644 --- a/options/options.cc +++ b/options/options.cc @@ -215,9 +215,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_HEADER( - log, " Options.snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", level_compaction_dynamic_level_bytes); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", @@ -493,7 +490,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; - snap_refresh_nanos = 0; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; diff --git a/options/options_helper.cc b/options/options_helper.cc index a973bbfde51..b7781ff6d25 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -177,7 +177,6 @@ ColumnFamilyOptions BuildColumnFamilyOptions( mutable_cf_options.target_file_size_multiplier; cf_opts.max_bytes_for_level_base = mutable_cf_options.max_bytes_for_level_base; - cf_opts.snap_refresh_nanos = mutable_cf_options.snap_refresh_nanos; cf_opts.max_bytes_for_level_multiplier = mutable_cf_options.max_bytes_for_level_multiplier; cf_opts.ttl = mutable_cf_options.ttl; @@ -527,9 +526,9 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, opt_address)); case OptionType::kBlockBasedTableIndexShorteningMode: return ParseEnum( - block_base_table_index_shortening_mode_string_map, value, - reinterpret_cast( - opt_address)); + block_base_table_index_shortening_mode_string_map, value, + reinterpret_cast( + opt_address)); case 
OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -1667,13 +1666,13 @@ std::unordered_map std::unordered_map OptionsHelper::block_base_table_index_shortening_mode_string_map = { - {"kNoShortening", - BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, - {"kShortenSeparators", - BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, - {"kShortenSeparatorsAndSuccessor", - BlockBasedTableOptions::IndexShorteningMode:: - kShortenSeparatorsAndSuccessor}}; + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, @@ -1911,10 +1910,6 @@ std::unordered_map {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, - {"snap_refresh_nanos", - {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, snap_refresh_nanos)}}, {"max_bytes_for_level_multiplier", {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2d6cc11c02e..005b9d53a89 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -415,7 +415,6 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" "kSnappyCompression;" "max_bytes_for_level_base=986;" - "snap_refresh_nanos=1000000000;" "bloom_locality=8016;" "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" diff --git a/options/options_test.cc b/options/options_test.cc index ded336dd18d..fbfee311b0a 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -74,7 +74,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"target_file_size_base", "12"}, {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, - {"snap_refresh_nanos", "1000000000"}, {"level_compaction_dynamic_level_bytes", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, @@ -184,7 +183,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); - ASSERT_EQ(new_cf_opt.snap_refresh_nanos, 1000000000U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); diff --git a/table/mock_table.cc b/table/mock_table.cc index 9b250604803..65a43616969 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -21,12 +21,6 @@ const InternalKeyComparator icmp_(BytewiseComparator()); } // namespace -stl_wrappers::KVMap MakeMockFile( - std::vector> l) { - return stl_wrappers::KVMap(l.begin(), l.end(), - stl_wrappers::LessOfComparator(&icmp_)); -} - stl_wrappers::KVMap MakeMockFile( std::initializer_list> l) { return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); @@ 
-143,14 +137,6 @@ void MockTableFactory::AssertLatestFile(
     ParseInternalKey(Slice(key), &ikey);
     std::cout << ikey.DebugString(false) << " -> " << value << std::endl;
   }
-  std::cout << "Expected:" << std::endl;
-  for (const auto& kv : file_contents) {
-    ParsedInternalKey ikey;
-    std::string key, value;
-    std::tie(key, value) = kv;
-    ParseInternalKey(Slice(key), &ikey);
-    std::cout << ikey.DebugString(false) << " -> " << value << std::endl;
-  }
   FAIL();
 }
diff --git a/table/mock_table.h b/table/mock_table.h
index 5bca14644d8..2f123a963cd 100644
--- a/table/mock_table.h
+++ b/table/mock_table.h
@@ -28,8 +28,6 @@ namespace mock {
 
 stl_wrappers::KVMap MakeMockFile(
     std::initializer_list<std::pair<std::string, std::string>> l = {});
-stl_wrappers::KVMap MakeMockFile(
-    std::vector<std::pair<std::string, std::string>> l);
 
 struct MockTableFileSystem {
   port::Mutex mutex;
@@ -186,12 +184,6 @@ class MockTableFactory : public TableFactory {
   // contents are equal to file_contents
   void AssertSingleFile(const stl_wrappers::KVMap& file_contents);
   void AssertLatestFile(const stl_wrappers::KVMap& file_contents);
-  stl_wrappers::KVMap output() {
-    assert(!file_system_.files.empty());
-    auto latest = file_system_.files.end();
-    --latest;
-    return latest->second;
-  }
 
  private:
   uint32_t GetAndWriteNextID(WritableFileWriter* file) const;
diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc
index fe9efd1f092..a1ebc8b9617 100644
--- a/util/compaction_job_stats_impl.cc
+++ b/util/compaction_job_stats_impl.cc
@@ -40,9 +40,6 @@ void CompactionJobStats::Reset() {
   file_fsync_nanos = 0;
   file_prepare_write_nanos = 0;
 
-  smallest_output_key_prefix.clear();
-  largest_output_key_prefix.clear();
-
   num_single_del_fallthru = 0;
   num_single_del_mismatch = 0;
 }

From 4e0f2aadb036c42950abe01fd8a777b576c44331 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Wed, 1 May 2019 10:13:33 -0700
Subject: [PATCH 008/572] DB::Close() to fail when there are unreleased snapshots (#5272)

Summary:
Sometimes users make the mistake of not releasing snapshots before closing the DB. This is an undocumented use of RocksDB and its behavior is unspecified. We change DB::Close() to give users a way to detect the mistake: Aborted() is now returned from DB::Close() when there are unreleased snapshots.
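To make the new contract concrete, here is a hedged usage sketch; it mirrors the pattern exercised by the `CloseWithUnreleasedSnapshot` test added below, with the function name being ours.

```cpp
#include <cassert>

#include "rocksdb/db.h"

// Sketch, assuming `db` is an open rocksdb::DB*. Close() now reports
// Aborted() while a snapshot is still outstanding; releasing it and
// retrying succeeds.
void CloseCarefully(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  // ... reads against `snap` ...
  rocksdb::Status s = db->Close();
  assert(s.IsAborted());   // snapshot still held
  db->ReleaseSnapshot(snap);
  s = db->Close();         // now succeeds
  assert(s.ok());
  delete db;               // the DB object must still be freed
}
```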
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5272
Differential Revision: D15159713
Pulled By: siying
fbshipit-source-id: 39369def612398d9f239d83d396b5a28e5af65cd
---
 HISTORY.md           |  2 ++
 db/db_impl.cc        | 14 ++++++++++++++
 db/db_test2.cc       | 15 +++++++++++++++
 include/rocksdb/db.h | 10 +++++++---
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2d3fd87c88c..2662cdea016 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,7 @@
 # Rocksdb Change Log
 ## Unreleased
+### Public API Change
+* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released.
 
 ## 6.2.0 (4/30/2019)
 ### New Features
diff --git a/db/db_impl.cc b/db/db_impl.cc
index c6268d0cb80..3ec9e2ab2d6 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -582,6 +582,12 @@ Status DBImpl::CloseHelper() {
       ret = s;
     }
   }
+  if (ret.IsAborted()) {
+    // Reserve IsAborted() error for those where users didn't release
+    // certain resource and they can release them and come back and
+    // retry. In this case, we wrap this exception to something else.
+    return Status::Incomplete(ret.ToString());
+  }
   return ret;
 }
 
@@ -3036,6 +3042,14 @@ DB::~DB() {}
 
 Status DBImpl::Close() {
   if (!closed_) {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      // If there is unreleased snapshot, fail the close call
+      if (!snapshots_.empty()) {
+        return Status::Aborted("Cannot close DB with unreleased snapshot.");
+      }
+    }
+
     closed_ = true;
     return CloseImpl();
   }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 75e7fe4abba..d93beb4477f 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -3738,6 +3738,21 @@ TEST_F(DBTest2, OldStatsInterface) {
   ASSERT_GT(dos->num_rt, 0);
   ASSERT_GT(dos->num_mt, 0);
 }
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 8bec4a56f94..7b49b92c239 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -232,9 +232,13 @@ class DB {
   // status in case there are any errors. This will not fsync the WAL files.
   // If syncing is required, the caller must first call SyncWAL(), or Write()
   // using an empty write batch with WriteOptions.sync=true.
-  // Regardless of the return status, the DB must be freed. If the return
-  // status is NotSupported(), then the DB implementation does cleanup in the
-  // destructor
+  // Regardless of the return status, the DB must be freed.
+  // If the return status is Aborted(), closing fails because there is
+  // unreleased snapshot in the system. In this case, users can release
+  // the unreleased snapshots and try again and expect it to succeed. For
+  // other status, recalling Close() will be no-op.
+  // If the return status is NotSupported(), then the DB implementation does
+  // cleanup in the destructor
   virtual Status Close() { return Status::NotSupported(); }
 
   // ListColumnFamilies will open the DB specified by argument name

From 4479dff208f8880ad853d9d6c52df64d90b6a0c1 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Wed, 1 May 2019 14:23:48 -0700
Subject: [PATCH 009/572] Reduce binary search when reseek into the same data block (#5256)

Summary:
Right now, when Seek() is called again, RocksDB always does a binary search against the files and index blocks, even if they end up with the same file/block. Improve it as follows:
1. in LevelIterator, reseek first checks the boundary of the current file. If the key falls into the same file, skip the binary search that finds the file.
2. in the block-based table iterator, reseek skips re-seeking the index block if the seek key is larger than the current key and lower than the index key (the boundary between the current block and the next block).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5256
Differential Revision: D15105072
Pulled By: siying
fbshipit-source-id: 39634bdb4a881082451fa39cecd7ecf12160bf80
---
 HISTORY.md                        |  3 +
 db/db_iterator_test.cc            | 98 +++++++++++++++++++++++++++
 db/version_set.cc                 | 20 ++++++-
 table/block.cc                    |  1 +
 table/block_based_table_reader.cc | 36 +++++++---
 5 files changed, 147 insertions(+), 11 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2662cdea016..011ce0a995d 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,9 @@
 ### Public API Change
 * Now DB::Close() will return Aborted() error when there is unreleased snapshot.
Users can retry after all snapshots are released. +### New Features +* Reduce binary search when iterator reseek into the same data block. + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index ec5fc8006b8..78b387577dd 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2450,6 +2450,104 @@ TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { ASSERT_EQ("a", it->key().ToString()); } +TEST_P(DBIteratorTest, AvoidReseekLevelIterator) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 800; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + std::string random_str = RandomString(&rnd, 180); + + ASSERT_OK(Put("1", random_str)); + ASSERT_OK(Put("2", random_str)); + ASSERT_OK(Put("3", random_str)); + ASSERT_OK(Put("4", random_str)); + // A new block + ASSERT_OK(Put("5", random_str)); + ASSERT_OK(Put("6", random_str)); + ASSERT_OK(Put("7", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("8", random_str)); + ASSERT_OK(Put("9", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + int num_find_file_in_level = 0; + int num_idx_blk_seek = 0; + SyncPoint::GetInstance()->SetCallBack( + "LevelIterator::Seek:BeforeFindFile", + [&](void* /*arg*/) { num_find_file_in_level++; }); + SyncPoint::GetInstance()->SetCallBack( + "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + { + std::unique_ptr iter(NewIterator(ReadOptions())); + iter->Seek("1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("6"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("7"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(3, num_idx_blk_seek); + + iter->Seek("8"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(2, num_find_file_in_level); + // Still re-seek because "8" is the boundary key, which has + // the same user key as the seek key. 
+ ASSERT_EQ(4, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + // Seek backward never triggers the index block seek to be skipped + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(6, num_idx_blk_seek); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index fdc07fee0e5..63d5af3af8d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1007,9 +1007,25 @@ class LevelIterator final : public InternalIterator { }; void LevelIterator::Seek(const Slice& target) { - size_t new_file_index = FindFile(icomparator_, *flevel_, target); + // Check whether the seek key fall under the same file + bool need_to_reseek = true; + if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) { + const FdWithKeyRange& cur_file = flevel_->files[file_index_]; + if (icomparator_.InternalKeyComparator::Compare( + target, cur_file.largest_key) <= 0 && + icomparator_.InternalKeyComparator::Compare( + target, cur_file.smallest_key) >= 0) { + need_to_reseek = false; + assert(static_cast(FindFile(icomparator_, *flevel_, target)) == + file_index_); + } + } + if (need_to_reseek) { + TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile"); + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + InitFileIterator(new_file_index); + } - InitFileIterator(new_file_index); if (file_iter_.iter() != nullptr) { file_iter_.Seek(target); } diff --git a/table/block.cc b/table/block.cc index 80bef4a913f..a6cc8d2705f 100644 --- a/table/block.cc +++ b/table/block.cc @@ -381,6 +381,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::Seek(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index d6c9ab88796..e39fd2a860d 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2334,17 +2334,35 @@ void BlockBasedTableIterator::Seek(const Slice& target) { return; } - SavePrevIndexValue(); - - index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - ResetDataIter(); - return; + bool need_seek_index = true; + if (block_iter_points_to_real_block_) { + // Reseek. + prev_index_value_ = index_iter_->value(); + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. 
+      if (user_comparator_.Compare(ExtractUserKey(target),
+                                   block_iter_.user_key()) > 0 &&
+          user_comparator_.Compare(ExtractUserKey(target),
+                                   index_iter_->user_key()) < 0) {
+        need_seek_index = false;
+      }
+  }
+
+  if (need_seek_index) {
+    index_iter_->Seek(target);
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+    InitDataBlock();
   }
 
-  InitDataBlock();
-
   block_iter_.Seek(target);
 
   FindKeyForward();

From d51eb0b583fe28ede2b4a6d778de4489433f1bbf Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Wed, 1 May 2019 20:36:09 -0700
Subject: [PATCH 010/572] set snappy compression only when supported (#4325)

Summary:
Right now `OptimizeLevelStyleCompaction` may set the compression type to Snappy even when Snappy is not supported, which may cause errors like "no snappy compression support"

Fixes https://github.com/facebook/rocksdb/issues/4283
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4325

Differential Revision: D15125542

Pulled By: miasantreble

fbshipit-source-id: 70890b73ababe16752721555dbd290633c2aafac
---
 options/options.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/options/options.cc b/options/options.cc
index bfe3e313d30..5e0c539afb5 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -548,7 +548,10 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction(
     if (i < 2) {
       compression_per_level[i] = kNoCompression;
     } else {
-      compression_per_level[i] = kSnappyCompression;
+      compression_per_level[i] =
+          LZ4_Supported()
+              ? kLZ4Compression
+              : (Snappy_Supported() ? kSnappyCompression : kNoCompression);
     }
   }
   return this;

From 434ccf2df4ead37156edc4b45071c17c7fbad3b3 Mon Sep 17 00:00:00 2001
From: anand76
Date: Wed, 1 May 2019 23:04:03 -0700
Subject: [PATCH 011/572] Add option to use MultiGet in db_stress (#5264)

Summary:
The new option will pick a batch size randomly in the range 1-64. It will then space the keys in the batch by random intervals, roughly as in the sketch below.
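As a rough illustration of this spacing scheme (a simplified, self-contained sketch; `PickBatchKeys` and the plain `std::mt19937_64` generator are illustrative stand-ins, not the actual db_stress helpers):
```
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Sketch: pick `num_keys` non-decreasing keys starting near `base_key`,
// spaced by random gaps inside the window [base_key, base_key + width).
std::vector<int64_t> PickBatchKeys(std::mt19937_64& rng, int num_keys,
                                   int64_t base_key, int64_t width) {
  std::vector<int64_t> keys;
  keys.reserve(num_keys);
  int64_t next_key = base_key + static_cast<int64_t>(rng() % width);
  keys.push_back(next_key);
  for (int i = 1; i < num_keys; ++i) {
    // Advance by a random gap bounded by what is left of the window.
    next_key += static_cast<int64_t>(rng() % (width - (next_key - base_key)));
    keys.push_back(next_key);
  }
  return keys;
}

int main() {
  std::mt19937_64 rng(301);
  int batch_size = 1 + static_cast<int>(rng() % 64);  // batch size in 1-64
  for (int64_t k : PickBatchKeys(rng, batch_size, 0, 1000)) {
    std::printf("%lld\n", static_cast<long long>(k));
  }
}
```
As the comment in the patch below notes, this spacing may produce duplicate keys, which the test tolerates.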
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5264 Differential Revision: D15175522 Pulled By: anand1976 fbshipit-source-id: c16baa69d0f1ff4cf53c55c813ddd82c8aeb58fc --- tools/db_crashtest.py | 1 + tools/db_stress.cc | 167 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6c7fbabbf11..62f72f2b5eb 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -65,6 +65,7 @@ "writepercent": 35, "format_version": lambda: random.randint(2, 4), "index_block_restart_interval": lambda: random.choice(range(1, 16)), + "use_multiget" : lambda: random.randint(0, 1), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 2ecd2aa6d13..97755fe962a 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -455,6 +455,9 @@ DEFINE_uint64(snapshot_hold_ops, 0, "If non-zero, then releases snapshots N operations after they're " "acquired."); +DEFINE_bool(use_multiget, false, + "If set, use the batched MultiGet API for reads"); + static bool ValidateInt32Percent(const char* flagname, int32_t value) { if (value < 0 || value>100) { fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", @@ -1725,6 +1728,27 @@ class StressTest { return base_key + thread->rand.Next() % FLAGS_active_width; } + static std::vector GenerateNKeys( + ThreadState* thread, + int num_keys, + uint64_t iteration) { + const double completed_ratio = + static_cast(iteration) / FLAGS_ops_per_thread; + const int64_t base_key = static_cast( + completed_ratio * (FLAGS_max_key - FLAGS_active_width)); + std::vector keys; + keys.reserve(num_keys); + int64_t next_key = base_key + thread->rand.Next() % FLAGS_active_width; + keys.push_back(next_key); + for (int i = 1; i < num_keys; ++i) { + // This may result in some duplicate keys + next_key = next_key + thread->rand.Next() % + (FLAGS_active_width - (next_key - base_key)); + keys.push_back(next_key); + } + return keys; + } + static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) { size_t value_sz = ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult; @@ -2162,7 +2186,14 @@ class StressTest { int prob_op = thread->rand.Uniform(100); if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { // OPERATION read - TestGet(thread, read_opts, rand_column_families, rand_keys); + if (FLAGS_use_multiget) { + int num_keys = thread->rand.Uniform(64); + rand_keys = GenerateNKeys(thread, num_keys, i); + TestMultiGet(thread, read_opts, rand_column_families, rand_keys); + i += num_keys - 1; + } else { + TestGet(thread, read_opts, rand_column_families, rand_keys); + } } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { // OPERATION prefix scan // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are @@ -2211,6 +2242,11 @@ class StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) = 0; + virtual std::vector TestMultiGet(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) = 0; + virtual Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, @@ -2546,6 +2582,8 @@ class StressTest { fprintf(stdout, "Checksum type : %s\n", checksum.c_str()); fprintf(stdout, "Max subcompactions : %" PRIu64 "\n", FLAGS_subcompactions); + fprintf(stdout, "Use MultiGet : %s\n", + FLAGS_use_multiget ? 
"true" : "false"); const char* memtablerep = ""; switch (FLAGS_rep_factory) { @@ -3012,6 +3050,38 @@ class NonBatchedOpsStressTest : public StressTest { return s; } + virtual std::vector TestMultiGet(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + size_t num_keys = rand_keys.size(); + std::vector key_str; + std::vector keys; + std::vector values(num_keys); + std::vector statuses(num_keys); + ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + + for (size_t i = 0; i < num_keys; ++i) { + key_str.emplace_back(Key(rand_keys[i])); + keys.emplace_back(key_str.back()); + } + db_->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), + statuses.data()); + for (const auto& s : statuses) { + if (s.ok()) { + // found case + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + // not found case + thread->stats.AddGets(1, 0); + } else { + // errors case + thread->stats.AddErrors(1); + } + } + return statuses; + } + virtual Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, @@ -3532,6 +3602,70 @@ class BatchedOpsStressTest : public StressTest { return s; } + virtual std::vector TestMultiGet(ThreadState* thread, + const ReadOptions& readoptions, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + int num_keys = rand_keys.size(); + std::vector statuses(num_keys); + std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + for (int key = 0; key < 10; ++key) { + std::vector key_slices; + std::vector values(num_keys); + ReadOptions readoptionscopy = readoptions; + readoptionscopy.snapshot = db_->GetSnapshot(); + std::vector key_str; + std::string from_db; + ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + + for (int rand_key = 0; rand_key < num_keys; ++rand_key) { + key_str.emplace_back(keys[key] + Key(rand_keys[rand_key])); + key_slices.emplace_back(key_str.back()); + } + db_->MultiGet(readoptionscopy, cfh, num_keys, key_slices.data(), + values.data(), statuses.data()); + for (int i = 0; i < num_keys; i++) { + Status s = statuses[i]; + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + // we continue after error rather than exiting so that we can + // find more errors if any + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + char expected_prefix = (keys[key])[0]; + char actual_prefix = (values[i])[0]; + if (actual_prefix != expected_prefix) { + fprintf(stderr, "error expected prefix = %c actual = %c\n", + expected_prefix, actual_prefix); + } + std::string str; + str.assign(values[i].data(), values[i].size()); + values[i].Reset(); + str[0] = ' '; // blank out the differing character + values[i].PinSelf(str); + thread->stats.AddGets(1, 1); + } + } + db_->ReleaseSnapshot(readoptionscopy.snapshot); + + // Now that we retrieved all values, check that they all match + for (int i = 1; i < num_keys; i++) { + if (values[i] != values[0]) { + fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", + key_str[i].c_str(), + StringToHex(values[0].ToString()).c_str(), + StringToHex(values[i].ToString()).c_str()); + // we continue after error rather than exiting so that we can + // find more errors if any + } + } + } + + return statuses; + } + // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P // in the same snapshot where P is the first 
FLAGS_prefix_size - 1 bytes
   // of the key. Each of these 10 scans returns a series of values;
@@ -3747,6 +3881,37 @@ class AtomicFlushStressTest : public StressTest {
     return s;
   }
 
+  virtual std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) {
+    int num_keys = rand_keys.size();
+    std::vector<std::string> key_str;
+    std::vector<Slice> keys;
+    std::vector<PinnableSlice> values(num_keys);
+    std::vector<Status> statuses(num_keys);
+    ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
+
+    for (int i = 0; i < num_keys; ++i) {
+      key_str.emplace_back(Key(rand_keys[i]));
+      keys.emplace_back(key_str.back());
+    }
+    db_->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), statuses.data());
+    for (auto s : statuses) {
+      if (s.ok()) {
+        // found case
+        thread->stats.AddGets(1, 1);
+      } else if (s.IsNotFound()) {
+        // not found case
+        thread->stats.AddGets(1, 0);
+      } else {
+        // errors case
+        thread->stats.AddErrors(1);
+      }
+    }
+    return statuses;
+  }
+
   virtual Status TestPrefixScan(ThreadState* thread,
                                 const ReadOptions& readoptions,
                                 const std::vector<int>& rand_column_families,

From 5882e847aabe4cd0a90e0cbaf5a5db39a0668322 Mon Sep 17 00:00:00 2001
From: Adam Retter
Date: Thu, 2 May 2019 14:24:21 -0700
Subject: [PATCH 012/572] Allow builds of RocksJava debug releases (#5274)

Summary:
This allows debug releases of RocksJava to be built with the Docker release targets.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5274

Differential Revision: D15185067

Pulled By: sagar0

fbshipit-source-id: f3988e472f281f5844d9a07098344a827b1e7eb1
---
 Makefile | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index 928046f0050..ee20a41bb1a 100644
--- a/Makefile
+++ b/Makefile
@@ -82,17 +82,23 @@ ifeq ($(MAKECMDGOALS),rocksdbjavastatic)
 endif
 
 ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease)
-  DEBUG_LEVEL=0
+  ifneq ($(DEBUG_LEVEL),2)
+    DEBUG_LEVEL=0
+  endif
 endif
 
 ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker)
-  DEBUG_LEVEL=0
+  ifneq ($(DEBUG_LEVEL),2)
+    DEBUG_LEVEL=0
+  endif
 endif
 
 ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish)
   DEBUG_LEVEL=0
 endif
 
+$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})
+
 # Lite build flag.
LITE ?= 0 ifeq ($(LITE), 0) @@ -1827,27 +1833,15 @@ rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 roc rocksdbjavastaticdockerx86: mkdir -p java/target - DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ - if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x86-be + docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target - DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ - if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x64-be + docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerppc64le: mkdir -p java/target - DOCKER_LINUX_PPC64LE_CONTAINER=`docker ps -aqf name=rocksdb_linux_ppc64le-be`; \ - if [ -z "$$DOCKER_LINUX_PPC64LE_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_ppc64le-be evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_ppc64le-be + docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral From 3e994809a1c00ca52fe45e598323e54db18cb90c Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 3 May 2019 09:58:12 -0700 Subject: [PATCH 013/572] fix implicit conversion error reported by clang check (#5277) Summary: fix the following clang check errors ``` tools/db_stress.cc:3609:30: error: implicit conversion loses integer precision: 'std::vector::size_type' (aka 'unsigned long') to 'int' [-Werror,-Wshorten-64-to-32] int num_keys = rand_keys.size(); ~~~~~~~~ ~~~~~~~~~~^~~~~~ tools/db_stress.cc:3888:30: error: implicit conversion loses integer precision: 'std::vector::size_type' (aka 'unsigned long') to 'int' [-Werror,-Wshorten-64-to-32] int num_keys = rand_keys.size(); ~~~~~~~~ ~~~~~~~~~~^~~~~~ 2 errors generated. 
make: *** [tools/db_stress.o] Error 1
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5277

Differential Revision: D15196620

Pulled By: miasantreble

fbshipit-source-id: d56b1420d4a9f1df875fc52877a5fbb342bc7cae
---
 tools/db_stress.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 97755fe962a..4ed66ed6d75 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -3606,7 +3606,7 @@ class BatchedOpsStressTest : public StressTest {
       const ReadOptions& readoptions,
       const std::vector<int>& rand_column_families,
       const std::vector<int64_t>& rand_keys) {
-    int num_keys = rand_keys.size();
+    size_t num_keys = rand_keys.size();
     std::vector<Status> statuses(num_keys);
     std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
     for (int key = 0; key < 10; ++key) {
@@ -3618,13 +3618,13 @@ class BatchedOpsStressTest : public StressTest {
       std::string from_db;
       ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
 
-      for (int rand_key = 0; rand_key < num_keys; ++rand_key) {
+      for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) {
         key_str.emplace_back(keys[key] + Key(rand_keys[rand_key]));
         key_slices.emplace_back(key_str.back());
       }
       db_->MultiGet(readoptionscopy, cfh, num_keys, key_slices.data(),
                     values.data(), statuses.data());
-      for (int i = 0; i < num_keys; i++) {
+      for (size_t i = 0; i < num_keys; i++) {
         Status s = statuses[i];
         if (!s.ok() && !s.IsNotFound()) {
           fprintf(stderr, "get error: %s\n", s.ToString().c_str());
@@ -3651,7 +3651,7 @@ class BatchedOpsStressTest : public StressTest {
       db_->ReleaseSnapshot(readoptionscopy.snapshot);
 
       // Now that we retrieved all values, check that they all match
-      for (int i = 1; i < num_keys; i++) {
+      for (size_t i = 1; i < num_keys; i++) {
         if (values[i] != values[0]) {
           fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
                   key_str[i].c_str(),
@@ -3885,14 +3885,14 @@ class AtomicFlushStressTest : public StressTest {
       const ReadOptions& read_opts,
       const std::vector<int>& rand_column_families,
       const std::vector<int64_t>& rand_keys) {
-    int num_keys = rand_keys.size();
+    size_t num_keys = rand_keys.size();
     std::vector<std::string> key_str;
     std::vector<Slice> keys;
     std::vector<PinnableSlice> values(num_keys);
     std::vector<Status> statuses(num_keys);
     ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
 
-    for (int i = 0; i < num_keys; ++i) {
+    for (size_t i = 0; i < num_keys; ++i) {
       key_str.emplace_back(Key(rand_keys[i]));
       keys.emplace_back(key_str.back());
     }

From 5d27d65bef4ec40fb7bb61f4f50817279abd85eb Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Fri, 3 May 2019 15:55:48 -0700
Subject: [PATCH 014/572] multiget: fix memory issues due to vector auto resizing (#5279)

Summary:
This PR fixes three memory issues found by ASAN:
* in db_stress, the key vector for MultiGet is created using `emplace_back`, which could potentially invalidate references to the underlying storage (the vector) due to auto resizing; fix by calling reserve in advance (see the sketch after this list)
* a similar issue in the construction of the GetContext autovector in version_set.cc
* in multiget_context.h, use the T[] specialization for the unique_ptr that holds a char array
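To make the first issue concrete, here is a minimal standalone sketch (hypothetical code, not taken from db_stress; plain `const int*` views stand in for `rocksdb::Slice`, which likewise holds a non-owning pointer into the vector's storage):
```
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> values;
  std::vector<const int*> views;  // non-owning, like Slices into key storage

  const size_t num_keys = 64;
  // Without this reserve, push_back may reallocate `values` as it grows,
  // leaving every pointer already stored in `views` dangling. Reserving the
  // final size up front (the fix in this PR) guarantees the elements never
  // move while views into them are being collected.
  values.reserve(num_keys);
  for (size_t i = 0; i < num_keys; ++i) {
    values.push_back(static_cast<int>(i));
    views.push_back(&values.back());
  }
  std::printf("first=%d last=%d\n", *views.front(), *views.back());
  return 0;
}
```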
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5279

Differential Revision: D15202893

Pulled By: miasantreble

fbshipit-source-id: 14cc2cda0ed64d29f2a1e264a6bfdaa4294ee75d
---
 db/version_set.cc        | 6 +++++-
 table/multiget_context.h | 2 +-
 tools/db_stress.cc       | 6 ++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 63d5af3af8d..6d4fb7315ad 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1774,7 +1774,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
         iter->value, nullptr, &(iter->merge_context),
         &iter->max_covering_tombstone_seq, this->env_, &iter->seq,
         merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob);
-    iter->get_context = &get_ctx.back();
+  }
+  int get_ctx_index = 0;
+  for (auto iter = range->begin(); iter != range->end();
+       ++iter, get_ctx_index++) {
+    iter->get_context = &(get_ctx[get_ctx_index]);
   }
 
   MultiGetRange file_picker_range(*range, range->begin(), range->end());
diff --git a/table/multiget_context.h b/table/multiget_context.h
index d3a8d09463b..c9e682fad4b 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -123,7 +123,7 @@ class MultiGetContext {
   KeyContext** sorted_keys_;
   size_t num_keys_;
   uint64_t value_mask_;
-  std::unique_ptr<char> lookup_key_heap_buf;
+  std::unique_ptr<char[]> lookup_key_heap_buf;
   LookupKey* lookup_key_ptr_;
 
  public:
diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 4ed66ed6d75..c6959802be3 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -3057,6 +3057,8 @@ class NonBatchedOpsStressTest : public StressTest {
     size_t num_keys = rand_keys.size();
     std::vector<std::string> key_str;
     std::vector<Slice> keys;
+    key_str.reserve(num_keys);
+    keys.reserve(num_keys);
     std::vector<PinnableSlice> values(num_keys);
     std::vector<Status> statuses(num_keys);
     ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
@@ -3615,6 +3617,8 @@ class BatchedOpsStressTest : public StressTest {
       ReadOptions readoptionscopy = readoptions;
       readoptionscopy.snapshot = db_->GetSnapshot();
       std::vector<std::string> key_str;
+      key_str.reserve(num_keys);
+      key_slices.reserve(num_keys);
       std::string from_db;
       ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
 
@@ -3888,6 +3892,8 @@ class AtomicFlushStressTest : public StressTest {
     size_t num_keys = rand_keys.size();
     std::vector<std::string> key_str;
     std::vector<Slice> keys;
+    keys.reserve(num_keys);
+    key_str.reserve(num_keys);
     std::vector<PinnableSlice> values(num_keys);
     std::vector<Status> statuses(num_keys);
     ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];

From 6a40ee5eb1f3179ad7e56a60d27feeacfcfa4d0c Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Fri, 3 May 2019 17:26:20 -0700
Subject: [PATCH 015/572] Refresh snapshot list during long compactions (2nd attempt) (#5278)

Summary:
Part of compaction CPU goes to processing the snapshot list; the larger the list, the bigger the overhead. Although the lifetime of most snapshots is much shorter than the lifetime of compactions, a compaction conservatively operates on the list of snapshots that it initially obtained. This patch allows the snapshot list to be updated via a callback if the compaction is taking long. This should let the compaction continue more efficiently with a much smaller snapshot list (a sketch of the refresh throttle follows below).
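Conceptually, the refresh throttle works roughly like the following simplified, hypothetical distillation of the `SnapshotListFetchCallback` added by this patch (a plain `std::chrono` clock stands in for RocksDB's internal `StopWatchNano` timer):
```
#include <algorithm>
#include <chrono>
#include <cstdint>

// Sketch of the throttle: a cheap mask test on most keys, a clock read only
// on every Nth key, and exponential (x4) back-off of the refresh interval.
class RefreshThrottle {
 public:
  // `every_nth_key` must be a power of two so the modulo becomes a mask.
  RefreshThrottle(uint64_t refresh_nanos, uint64_t every_nth_key)
      : start_(std::chrono::steady_clock::now()),
        refresh_nanos_(refresh_nanos),
        mask_(every_nth_key - 1) {}

  bool TimeToRefresh(uint64_t key_index) {
    if ((key_index & mask_) != 0) {
      return false;  // skip the clock read for most keys
    }
    auto elapsed = static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now() - start_)
            .count());
    bool refresh = elapsed > refresh_nanos_;
    if (refresh) {
      // Quadruple the threshold; std::max guards against shift overflow.
      refresh_nanos_ = std::max(refresh_nanos_, refresh_nanos_ << 2);
    }
    return refresh;
  }

 private:
  std::chrono::steady_clock::time_point start_;
  uint64_t refresh_nanos_;
  const uint64_t mask_;
};
```
The mask test keeps the common path to a single AND and compare, so only every Nth distinct key pays for a clock read.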
For simplicity, the feature is disabled in two cases: i) when more than one sub-compaction shares the same snapshot list, ii) when Range Delete is used, in which case the range delete aggregator has its own copy of the snapshot list.

This fixes the reverted https://github.com/facebook/rocksdb/pull/5099 issue with range deletes.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5278

Differential Revision: D15203291

Pulled By: maysamyabandeh

fbshipit-source-id: fa645611e606aa222c7ce53176dc5bb6f259c258
---
 HISTORY.md                        |   1 +
 db/c.cc                           |   5 ++
 db/compaction_iterator.cc         |  60 ++++++++-----
 db/compaction_iterator.h          |  60 ++++++++++++-
 db/compaction_job.cc              |   7 +-
 db/compaction_job.h               |  31 ++++---
 db/compaction_job_test.cc         | 141 +++++++++++++++++++++++++++---
 db/db_impl.h                      |   9 +-
 db/db_impl_compaction_flush.cc    |  39 ++++++++-
 db/snapshot_impl.h                |  14 ++-
 include/rocksdb/c.h               |   2 +
 include/rocksdb/options.h         |  11 +++
 options/cf_options.cc             |   2 +
 options/cf_options.h              |   3 +
 options/options.cc                |   4 +
 options/options_helper.cc         |  25 +++---
 options/options_settable_test.cc  |   1 +
 options/options_test.cc           |   2 +
 table/mock_table.cc               |  14 +++
 table/mock_table.h                |   8 ++
 tools/db_crashtest.py             |   5 +-
 util/compaction_job_stats_impl.cc |   3 +
 22 files changed, 375 insertions(+), 72 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 011ce0a995d..65d64d23604 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,6 +5,7 @@
 
 ### New Features
 * Reduce binary search when iterator reseek into the same data block.
+* Add an option `snap_refresh_nanos` (default 0.1s) to periodically refresh the snapshot list in compaction jobs. Set it to 0 to disable the feature.
 
 ## 6.2.0 (4/30/2019)
 ### New Features
diff --git a/db/c.cc b/db/c.cc
index 9f5995a413b..aac1cf4087c 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -2226,6 +2226,11 @@ void rocksdb_options_set_max_bytes_for_level_base(
   opt->rep.max_bytes_for_level_base = n;
 }
 
+void rocksdb_options_set_snap_refresh_nanos(rocksdb_options_t* opt,
+                                            uint64_t n) {
+  opt->rep.snap_refresh_nanos = n;
+}
+
 void rocksdb_options_set_level_compaction_dynamic_level_bytes(
     rocksdb_options_t* opt, unsigned char v) {
   opt->rep.level_compaction_dynamic_level_bytes = v;
diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index 93c2b5fa9e9..bce0b82dbc7 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -38,14 +38,16 @@ CompactionIterator::CompactionIterator(
     CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
     const CompactionFilter* compaction_filter,
     const std::atomic<bool>* shutting_down,
-    const SequenceNumber preserve_deletes_seqnum)
+    const SequenceNumber preserve_deletes_seqnum,
+    SnapshotListFetchCallback* snap_list_callback)
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots,
          earliest_write_conflict_snapshot, snapshot_checker, env,
           report_detailed_time, expect_valid_internal_key, range_del_agg,
           std::unique_ptr<CompactionProxy>(
              compaction ?
new CompactionProxy(compaction) : nullptr), - compaction_filter, shutting_down, preserve_deletes_seqnum) {} + compaction_filter, shutting_down, preserve_deletes_seqnum, + snap_list_callback) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -57,7 +59,8 @@ CompactionIterator::CompactionIterator( std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum) + const SequenceNumber preserve_deletes_seqnum, + SnapshotListFetchCallback* snap_list_callback) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -75,7 +78,8 @@ CompactionIterator::CompactionIterator( current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), - current_key_committed_(false) { + current_key_committed_(false), + snap_list_callback_(snap_list_callback) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); assert(snapshots_ != nullptr); bottommost_level_ = @@ -83,24 +87,7 @@ CompactionIterator::CompactionIterator( if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } - if (snapshots_->size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip_ = true; - earliest_snapshot_iter_ = snapshots_->end(); - earliest_snapshot_ = kMaxSequenceNumber; - latest_snapshot_ = 0; - } else { - visible_at_tip_ = false; - earliest_snapshot_iter_ = snapshots_->begin(); - earliest_snapshot_ = snapshots_->at(0); - latest_snapshot_ = snapshots_->back(); - } -#ifndef NDEBUG - // findEarliestVisibleSnapshot assumes this ordering. - for (size_t i = 1; i < snapshots_->size(); ++i) { - assert(snapshots_->at(i - 1) < snapshots_->at(i)); - } -#endif + ProcessSnapshotList(); input_->SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -222,6 +209,28 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } } +void CompactionIterator::ProcessSnapshotList() { +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } + released_snapshots_.clear(); +} + void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; @@ -269,6 +278,13 @@ void CompactionIterator::NextFromInput() { // compaction filter). ikey_.user_key is pointing to the copy. 
if (!has_current_user_key_ ||
       !cmp_->Equal(ikey_.user_key, current_user_key_)) {
+    num_keys_++;
+    // Use num_keys_ to reduce the overhead of reading current time
+    if (snap_list_callback_ && snapshots_->size() &&
+        snap_list_callback_->TimeToRefresh(num_keys_)) {
+      snap_list_callback_->Refresh(snapshots_, latest_snapshot_);
+      ProcessSnapshotList();
+    }
     // First occurrence of this user key
     // Copy key for output
     key_ = current_key_.SetInternalKey(key_, &ikey_);
diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h
index a9e7a262071..6ab43b1becf 100644
--- a/db/compaction_iterator.h
+++ b/db/compaction_iterator.h
@@ -21,6 +21,53 @@
 
 namespace rocksdb {
 
+// This callback can be used to refresh the snapshot list from the db. It
+// includes logic to exponentially decrease the refresh rate to limit the
+// overhead of refresh.
+class SnapshotListFetchCallback {
+ public:
+  SnapshotListFetchCallback(Env* env, uint64_t snap_refresh_nanos,
+                            size_t every_nth_key = 1024)
+      : timer_(env, /*auto restart*/ true),
+        snap_refresh_nanos_(snap_refresh_nanos),
+        every_nth_key_minus_one_(every_nth_key - 1) {
+    assert(every_nth_key > 0);
+    assert((ceil(log2(every_nth_key)) == floor(log2(every_nth_key))));
+  }
+  // Refresh the snapshot list. snapshots will be replaced with the new list.
+  // max is the upper bound. Note: this function will acquire the db_mutex_.
+  virtual void Refresh(std::vector<SequenceNumber>* snapshots,
+                       SequenceNumber max) = 0;
+  inline bool TimeToRefresh(const size_t key_index) {
+    // skip the key if key_index % every_nth_key (which is a power of 2) is not 0.
+    if ((key_index & every_nth_key_minus_one_) != 0) {
+      return false;
+    }
+    const uint64_t elapsed = timer_.ElapsedNanos();
+    auto ret = elapsed > snap_refresh_nanos_;
+    // pre-compute the next time threshold
+    if (ret) {
+      // inc next refresh period exponentially (by x4)
+      auto next_refresh_threshold = snap_refresh_nanos_ << 2;
+      // make sure the shift has not overflown the highest 1 bit
+      snap_refresh_nanos_ =
+          std::max(snap_refresh_nanos_, next_refresh_threshold);
+    }
+    return ret;
+  }
+  static constexpr SnapshotListFetchCallback* kDisabled = nullptr;
+
+  virtual ~SnapshotListFetchCallback() {}
+
+ private:
+  // Time since the callback was created
+  StopWatchNano timer_;
+  // The delay before calling ::Refresh. To be increased exponentially.
+  uint64_t snap_refresh_nanos_;
+  // Skip every nth key. Number n is a power of 2. The math requires n-1.
+  const uint64_t every_nth_key_minus_one_;
+};
+
 class CompactionIterator {
  public:
   // A wrapper around Compaction. Has a much smaller interface, only what
@@ -69,7 +116,8 @@ class CompactionIterator {
                      const Compaction* compaction = nullptr,
                      const CompactionFilter* compaction_filter = nullptr,
                      const std::atomic<bool>* shutting_down = nullptr,
-                     const SequenceNumber preserve_deletes_seqnum = 0);
+                     const SequenceNumber preserve_deletes_seqnum = 0,
+                     SnapshotListFetchCallback* snap_list_callback = nullptr);
 
   // Constructor with custom CompactionProxy, used for tests.
CompactionIterator(InternalIterator* input, const Comparator* cmp, @@ -82,7 +130,8 @@ class CompactionIterator { std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0); + const SequenceNumber preserve_deletes_seqnum = 0, + SnapshotListFetchCallback* snap_list_callback = nullptr); ~CompactionIterator(); @@ -110,6 +159,8 @@ class CompactionIterator { private: // Processes the input stream to find the next output void NextFromInput(); + // Process snapshots_ and assign related variables + void ProcessSnapshotList(); // Do last preparations before presenting the output to the callee. At this // point this only zeroes out the sequence number if possible for better @@ -144,7 +195,7 @@ class CompactionIterator { InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; - const std::vector* snapshots_; + std::vector* snapshots_; // List of snapshots released during compaction. // findEarliestVisibleSnapshot() find them out from return of // snapshot_checker, and make sure they will not be returned as @@ -219,6 +270,9 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; + SnapshotListFetchCallback* snap_list_callback_; + // number of distinct keys processed + size_t num_keys_ = 0; bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 45221a15512..00386a99ad4 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -315,7 +315,7 @@ CompactionJob::CompactionJob( const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri) + Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), @@ -336,6 +336,7 @@ CompactionJob::CompactionJob( db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), + snap_list_callback_(snap_list_callback), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), @@ -892,7 +893,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_)); + shutting_down_, preserve_deletes_seqnum_, + // Currently range_del_agg is incompatible with snapshot refresh feature. + range_del_agg.IsEmpty() ? 
snap_list_callback_ : nullptr)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { diff --git a/db/compaction_job.h b/db/compaction_job.h index 9767985f336..b3a0f2eb4b5 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -57,22 +57,20 @@ class VersionSet; class CompactionJob { public: - CompactionJob(int job_id, Compaction* compaction, - const ImmutableDBOptions& db_options, - const EnvOptions env_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, - CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri); + CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, + Directory* db_directory, Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback); ~CompactionJob(); @@ -152,6 +150,7 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; + SnapshotListFetchCallback* snap_list_callback_; // This is the earliest snapshot that could be used for write-conflict // checking by a transaction. 
For any user-key newer than this snapshot, we diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index f05a8ec2ff7..60394cc9735 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -5,7 +5,13 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include +#include #include #include #include @@ -194,6 +200,13 @@ class CompactionJobTest : public testing::Test { } void NewDB() { + DestroyDB(dbname_, Options()); + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_)); + compaction_job_stats_.Reset(); + VersionEdit new_db; new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -230,7 +243,10 @@ class CompactionJobTest : public testing::Test { const std::vector>& input_files, const stl_wrappers::KVMap& expected_results, const std::vector& snapshots = {}, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + int output_level = 1, bool verify = true, + SnapshotListFetchCallback* snapshot_fetcher = + SnapshotListFetchCallback::kDisabled) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -247,7 +263,7 @@ class CompactionJobTest : public testing::Test { Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - compaction_input_files, 1, 1024 * 1024, + compaction_input_files, output_level, 1024 * 1024, 10 * 1024 * 1024, 0, kNoCompression, cfd->ioptions()->compression_opts, 0, {}, true); compaction.SetInputVersion(cfd->current()); @@ -263,7 +279,7 @@ class CompactionJobTest : public testing::Test { nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER); + Env::Priority::USER, snapshot_fetcher); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -275,15 +291,17 @@ class CompactionJobTest : public testing::Test { ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); mutex_.Unlock(); - if (expected_results.size() == 0) { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); - } else { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); - ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); - mock_table_factory_->AssertLatestFile(expected_results); + if (verify) { + if (expected_results.size() == 0) { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); + } else { + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); + mock_table_factory_->AssertLatestFile(expected_results); + } } } @@ -938,6 +956,105 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { RunCompaction({files}, expected_results); } +// Test the snapshot fetcher in compaction +TEST_F(CompactionJobTest, SnapshotRefresh) { + uint64_t time_seed = env_->NowMicros(); + printf("time_seed is %" PRIu64 
"\n", time_seed); // would help to reproduce + Random64 rand(time_seed); + std::vector db_snapshots; + class SnapshotListFetchCallbackTest : public SnapshotListFetchCallback { + public: + SnapshotListFetchCallbackTest(Env* env, Random64& rand, + std::vector* snapshots) + : SnapshotListFetchCallback(env, 0 /*no time delay*/, + 1 /*fetch after each key*/), + rand_(rand), + snapshots_(snapshots) {} + virtual void Refresh(std::vector* snapshots, + SequenceNumber) override { + assert(snapshots->size()); + assert(snapshots_->size()); + assert(snapshots_->size() == snapshots->size()); + if (rand_.OneIn(2)) { + uint64_t release_index = rand_.Uniform(snapshots_->size()); + snapshots_->erase(snapshots_->begin() + release_index); + *snapshots = *snapshots_; + } + } + + private: + Random64 rand_; + std::vector* snapshots_; + } snapshot_fetcher(env_, rand, &db_snapshots); + + std::vector> file1_kvs, file2_kvs; + std::array types = {kTypeValue, kTypeDeletion, + kTypeSingleDeletion}; + SequenceNumber last_seq = 0; + for (int i = 1; i < 100; i++) { + SequenceNumber seq = last_seq + 1; + last_seq = seq; + if (rand.OneIn(2)) { + auto type = types[rand.Uniform(types.size())]; + file1_kvs.push_back( + {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); + } + } + auto file1 = mock::MakeMockFile(file1_kvs); + for (int i = 1; i < 100; i++) { + SequenceNumber seq = last_seq + 1; + last_seq++; + if (rand.OneIn(2)) { + auto type = types[rand.Uniform(types.size())]; + file2_kvs.push_back( + {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); + } + } + auto file2 = mock::MakeMockFile(file2_kvs); + for (SequenceNumber i = 1; i < last_seq + 1; i++) { + if (rand.OneIn(5)) { + db_snapshots.push_back(i); + } + } + + const bool kVerify = true; + const int output_level_0 = 0; + NewDB(); + AddMockFile(file1); + AddMockFile(file2); + SetLastSequence(last_seq); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + // put the output on L0 since it is easier to feed them again to the 2nd + // compaction + RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, + output_level_0, !kVerify, &snapshot_fetcher); + + // Now db_snapshots are changed. Run the compaction again without snapshot + // fetcher but with the updated snapshot list. + compaction_job_stats_.Reset(); + files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, + output_level_0 + 1, !kVerify); + // The result should be what we get if we run compaction without snapshot + // fetcher on the updated list of snapshots + auto expected = mock_table_factory_->output(); + + NewDB(); + AddMockFile(file1); + AddMockFile(file2); + SetLastSequence(last_seq); + files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, + output_level_0, !kVerify); + // The 2nd compaction above would get rid of useless delete markers. To get + // the output here exactly as what we got above after two compactions, we also + // run the compaction for 2nd time. 
+ compaction_job_stats_.Reset(); + files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, + output_level_0 + 1, !kVerify); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl.h b/db/db_impl.h index 9bdb0abdc10..623f69ba6ef 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -561,6 +561,13 @@ class DBImpl : public DB { const SnapshotList& snapshots() const { return snapshots_; } + void LoadSnapshots(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot, + const SequenceNumber& max_seq) const { + InstrumentedMutexLock l(mutex()); + snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq); + } + const ImmutableDBOptions& immutable_db_options() const { return immutable_db_options_; } @@ -739,7 +746,7 @@ class DBImpl : public DB { // Not thread-safe. void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); - InstrumentedMutex* mutex() { return &mutex_; } + InstrumentedMutex* mutex() const { return &mutex_; } Status NewDB(); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 49b6c0fd804..1cdadf03942 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -798,6 +798,31 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } +namespace { +class SnapshotListFetchCallbackImpl : public SnapshotListFetchCallback { + public: + SnapshotListFetchCallbackImpl(DBImpl* db_impl, Env* env, + uint64_t snap_refresh_nanos, Logger* info_log) + : SnapshotListFetchCallback(env, snap_refresh_nanos), + db_impl_(db_impl), + info_log_(info_log) {} + virtual void Refresh(std::vector* snapshots, + SequenceNumber max) override { + size_t prev = snapshots->size(); + snapshots->clear(); + db_impl_->LoadSnapshots(snapshots, nullptr, max); + size_t now = snapshots->size(); + ROCKS_LOG_DEBUG(info_log_, + "Compaction snapshot count refreshed from %zu to %zu", prev, + now); + } + + private: + DBImpl* db_impl_; + Logger* info_log_; +}; +} // namespace + Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -969,6 +994,9 @@ Status DBImpl::CompactFilesImpl( assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; + SnapshotListFetchCallbackImpl fetch_callback( + this, env_, c->mutable_cf_options()->snap_refresh_nanos, + immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -978,7 +1006,9 @@ Status DBImpl::CompactFilesImpl( snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER); + &compaction_job_stats, Env::Priority::USER, + immutable_db_options_.max_subcompactions <= 1 ? 
&fetch_callback + : nullptr); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -2622,6 +2652,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); + SnapshotListFetchCallbackImpl fetch_callback( + this, env_, c->mutable_cf_options()->snap_refresh_nanos, + immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -2631,7 +2664,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, thread_pri); + &compaction_job_stats, thread_pri, + immutable_db_options_.max_subcompactions <= 1 ? &fetch_callback + : nullptr); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index f2610fd18b2..f1cf6f4b755 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -91,13 +91,23 @@ class SnapshotList { SequenceNumber* oldest_write_conflict_snapshot = nullptr, const SequenceNumber& max_seq = kMaxSequenceNumber) const { std::vector ret; + GetAll(&ret, oldest_write_conflict_snapshot, max_seq); + return ret; + } + + void GetAll(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot = nullptr, + const SequenceNumber& max_seq = kMaxSequenceNumber) const { + std::vector& ret = *snap_vector; + // So far we have no use case that would pass a non-empty vector + assert(ret.size() == 0); if (oldest_write_conflict_snapshot != nullptr) { *oldest_write_conflict_snapshot = kMaxSequenceNumber; } if (empty()) { - return ret; + return; } const SnapshotImpl* s = &list_; while (s->next_ != &list_) { @@ -119,7 +129,7 @@ class SnapshotList { s = s->next_; } - return ret; + return; } // get the sequence number of the most recent snapshot diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 4b34996a730..a0ae7ca7785 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -816,6 +816,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_snap_refresh_nanos( + rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 4cc2998b2d8..a1071f62ec7 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -269,6 +269,17 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; + // If non-zero, compactions will periodically refresh the snapshot list. The + // delay for the first refresh is snap_refresh_nanos nano seconds and + // exponentially increases afterwards. When having many short-lived snapshots, + // this option helps reducing the cpu usage of long-running compactions. 
The + // feature is disabled when max_subcompactions is greater than one. + // + // Default: 0.1s + // + // Dynamically changeable through SetOptions() API + uint64_t snap_refresh_nanos = 100 * 1000 * 1000; // 0.1s + // Disable automatic compactions. Manual compactions can still // be issued on this column family // diff --git a/options/cf_options.cc b/options/cf_options.cc index 78accaeb915..f7af3f834c9 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -169,6 +169,8 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_multiplier); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); + ROCKS_LOG_INFO(log, " snap_refresh_nanos: %" PRIu64, + snap_refresh_nanos); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); ROCKS_LOG_INFO(log, " ttl: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index d0c4390c36d..47fca58fa7d 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -149,6 +149,7 @@ struct MutableCFOptions { target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), + snap_refresh_nanos(options.snap_refresh_nanos), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), @@ -185,6 +186,7 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), + snap_refresh_nanos(0), max_bytes_for_level_multiplier(0), ttl(0), periodic_compaction_seconds(0), @@ -236,6 +238,7 @@ struct MutableCFOptions { uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; + uint64_t snap_refresh_nanos; double max_bytes_for_level_multiplier; uint64_t ttl; uint64_t periodic_compaction_seconds; diff --git a/options/options.cc b/options/options.cc index 5e0c539afb5..057727e59fb 100644 --- a/options/options.cc +++ b/options/options.cc @@ -215,6 +215,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); + ROCKS_LOG_HEADER( + log, " Options.snap_refresh_nanos: %" PRIu64, + snap_refresh_nanos); ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", level_compaction_dynamic_level_bytes); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", @@ -490,6 +493,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; + snap_refresh_nanos = 0; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; diff --git a/options/options_helper.cc b/options/options_helper.cc index b7781ff6d25..a973bbfde51 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -177,6 +177,7 @@ ColumnFamilyOptions BuildColumnFamilyOptions( mutable_cf_options.target_file_size_multiplier; cf_opts.max_bytes_for_level_base = mutable_cf_options.max_bytes_for_level_base; + cf_opts.snap_refresh_nanos = mutable_cf_options.snap_refresh_nanos; cf_opts.max_bytes_for_level_multiplier = mutable_cf_options.max_bytes_for_level_multiplier; cf_opts.ttl = mutable_cf_options.ttl; @@ -526,9 +527,9 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, opt_address)); case 
OptionType::kBlockBasedTableIndexShorteningMode: return ParseEnum( - block_base_table_index_shortening_mode_string_map, value, - reinterpret_cast( - opt_address)); + block_base_table_index_shortening_mode_string_map, value, + reinterpret_cast( + opt_address)); case OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -1666,13 +1667,13 @@ std::unordered_map std::unordered_map OptionsHelper::block_base_table_index_shortening_mode_string_map = { - {"kNoShortening", - BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, - {"kShortenSeparators", - BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, - {"kShortenSeparatorsAndSuccessor", - BlockBasedTableOptions::IndexShorteningMode:: - kShortenSeparatorsAndSuccessor}}; + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, @@ -1910,6 +1911,10 @@ std::unordered_map {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, + {"snap_refresh_nanos", + {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, snap_refresh_nanos)}}, {"max_bytes_for_level_multiplier", {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 005b9d53a89..2d6cc11c02e 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -415,6 +415,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" "kSnappyCompression;" "max_bytes_for_level_base=986;" + "snap_refresh_nanos=1000000000;" "bloom_locality=8016;" "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" diff --git a/options/options_test.cc b/options/options_test.cc index fbfee311b0a..ded336dd18d 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -74,6 +74,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"target_file_size_base", "12"}, {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, + {"snap_refresh_nanos", "1000000000"}, {"level_compaction_dynamic_level_bytes", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, @@ -183,6 +184,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_cf_opt.snap_refresh_nanos, 1000000000U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); diff --git a/table/mock_table.cc b/table/mock_table.cc index 65a43616969..9b250604803 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -21,6 +21,12 @@ const InternalKeyComparator icmp_(BytewiseComparator()); } // namespace +stl_wrappers::KVMap 
MakeMockFile( + std::vector> l) { + return stl_wrappers::KVMap(l.begin(), l.end(), + stl_wrappers::LessOfComparator(&icmp_)); +} + stl_wrappers::KVMap MakeMockFile( std::initializer_list> l) { return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); @@ -137,6 +143,14 @@ void MockTableFactory::AssertLatestFile( ParseInternalKey(Slice(key), &ikey); std::cout << ikey.DebugString(false) << " -> " << value << std::endl; } + std::cout << "Expected:" << std::endl; + for (const auto& kv : file_contents) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ParseInternalKey(Slice(key), &ikey); + std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + } FAIL(); } } diff --git a/table/mock_table.h b/table/mock_table.h index 2f123a963cd..5bca14644d8 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -28,6 +28,8 @@ namespace mock { stl_wrappers::KVMap MakeMockFile( std::initializer_list> l = {}); +stl_wrappers::KVMap MakeMockFile( + std::vector> l); struct MockTableFileSystem { port::Mutex mutex; @@ -184,6 +186,12 @@ class MockTableFactory : public TableFactory { // contents are equal to file_contents void AssertSingleFile(const stl_wrappers::KVMap& file_contents); void AssertLatestFile(const stl_wrappers::KVMap& file_contents); + stl_wrappers::KVMap output() { + assert(!file_system_.files.empty()); + auto latest = file_system_.files.end(); + --latest; + return latest->second; + } private: uint32_t GetAndWriteNextID(WritableFileWriter* file) const; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 62f72f2b5eb..6487562d8bb 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -343,8 +343,9 @@ def whitebox_crash_main(args, unknown_args): if additional_opts['kill_random_test'] is None and (retncode == 0): # we expect zero retncode if no kill option expected = True - elif additional_opts['kill_random_test'] is not None and retncode < 0: - # we expect negative retncode if kill option was given + elif additional_opts['kill_random_test'] is not None and retncode <= 0: + # When kill option is given, the test MIGHT kill itself. + # If it does, negative retncode is expected. Otherwise 0. expected = True if not expected: diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc index a1ebc8b9617..fe9efd1f092 100644 --- a/util/compaction_job_stats_impl.cc +++ b/util/compaction_job_stats_impl.cc @@ -40,6 +40,9 @@ void CompactionJobStats::Reset() { file_fsync_nanos = 0; file_prepare_write_nanos = 0; + smallest_output_key_prefix.clear(); + largest_output_key_prefix.clear(); + num_single_del_fallthru = 0; num_single_del_mismatch = 0; } From 930bfa575079a4f99cd1963df7a7f0b3f1b5691d Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 6 May 2019 18:23:45 -0700 Subject: [PATCH 016/572] Disable MultiGet from db_stress (#5284) Summary: Disable it for now until we can get stress tests to pass consistently. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5284 Differential Revision: D15230727 Pulled By: anand1976 fbshipit-source-id: 239baacdb3c4cd4fb7c4447f7582b9042501d752 --- tools/db_crashtest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6487562d8bb..780c987e929 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -65,7 +65,7 @@ "writepercent": 35, "format_version": lambda: random.randint(2, 4), "index_block_restart_interval": lambda: random.choice(range(1, 16)), - "use_multiget" : lambda: random.randint(0, 1), + "use_multiget" : 0, } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' From eea1cad850c2e268b0bfde208a005e44289dea47 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Tue, 7 May 2019 20:17:48 -0700 Subject: [PATCH 017/572] avoid updating index type during iterator creation (#5288) Summary: Right now there is a potential race condition where two threads are created to iterate through the DB (https://gist.github.com/miasantreble/88f5798a397ee7cb8e7baff9db2d9e85). The problem is that in `BlockBasedTable::NewIndexIterator`, if both threads fail to find the index_reader in the block cache, they will call `CreateIndexReader->UpdateIndexType()`, which creates a race to update `index_type` in the shared rep_ object. By checking the code, we realize the index type is always populated by `PrefetchIndexAndFilterBlocks` during the table `Open` call, so there is no need to update the index type every time during iterator creation. This PR attempts to fix the race condition by removing the unnecessary call to `UpdateIndexType`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5288 Differential Revision: D15252509 Pulled By: miasantreble fbshipit-source-id: 6e3258652121d5c76d267f7ac457e15c5e84756e --- table/block_based_table_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index e39fd2a860d..514587d0b96 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -3178,7 +3178,7 @@ BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { Status BlockBasedTable::CreateIndexReader( FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter, int level) { - auto index_type_on_file = UpdateIndexType(); + auto index_type_on_file = rep_->index_type; auto file = rep_->file.get(); const InternalKeyComparator* icomparator = &rep_->internal_comparator; From bdba6c56dde69b25762b27a9f1f95f51f2ee4551 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Wed, 8 May 2019 10:56:38 -0700 Subject: [PATCH 018/572] add WAL replay in TryCatchUpWithPrimary (#5282) Summary: Previously, in PR https://github.com/facebook/rocksdb/pull/5161, we added the capability to do WAL tailing in `OpenAsSecondary`. In this PR we extend this feature to `TryCatchUpWithPrimary`, which is useful for a secondary RocksDB instance to retrieve and apply the latest updates and refresh log readers if needed.
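As a rough usage sketch (not part of this patch; the paths below are placeholders, and the max_open_files setting reflects an assumed requirement of secondary mode at the time), a reader process opens the primary's directory as a secondary and then periodically catches up:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      options.max_open_files = -1;  // assumption: secondary mode needs -1 here
      rocksdb::DB* db = nullptr;
      // "/tmp/primary" and "/tmp/secondary" are hypothetical paths.
      rocksdb::Status s = rocksdb::DB::OpenAsSecondary(options, "/tmp/primary",
                                                       "/tmp/secondary", &db);
      assert(s.ok());
      // With this patch, this replays new MANIFEST entries and new WALs.
      s = db->TryCatchUpWithPrimary();
      assert(s.ok());
      std::string value;
      s = db->Get(rocksdb::ReadOptions(), "foo", &value);  // sees fresh writes
      delete db;
      return 0;
    }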
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5282 Differential Revision: D15261011 Pulled By: miasantreble fbshipit-source-id: a15c94471e8c3b3b1f7f47c3135db1126e936949 --- db/db_impl_secondary.cc | 86 +++++++++++++++++++++++++---------------- db/db_impl_secondary.h | 2 + db/db_secondary_test.cc | 6 +++ 3 files changed, 60 insertions(+), 34 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 90e979b4e58..007910ea5b4 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -59,40 +59,7 @@ Status DBImplSecondary::Recover( single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; - // Recover from all newer log files than the ones named in the - // descriptor. - std::vector filenames; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); - if (s.IsNotFound()) { - return Status::InvalidArgument("Failed to open wal_dir", - immutable_db_options_.wal_dir); - } else if (!s.ok()) { - return s; - } - - std::vector logs; - // if log_readers_ is non-empty, it means we have applied all logs with log - // numbers smaller than the smallest log in log_readers_, so there is no - // need to pass these logs to RecoverLogFiles - uint64_t log_number_min = 0; - if (log_readers_.size() > 0) { - log_number_min = log_readers_.begin()->first; - } - for (size_t i = 0; i < filenames.size(); i++) { - uint64_t number; - FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && - number >= log_number_min) { - logs.push_back(number); - } - } - - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - SequenceNumber next_sequence(kMaxSequenceNumber); - s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); - } + s = FindAndRecoverLogFiles(); } // TODO: update options_file_number_ needed? 
@@ -100,6 +67,41 @@ Status DBImplSecondary::Recover( return s; } +// List wal_dir and find all new WALs, return these log numbers +Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { + assert(logs != nullptr); + std::vector filenames; + Status s; + s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + if (s.IsNotFound()) { + return Status::InvalidArgument("Failed to open wal_dir", + immutable_db_options_.wal_dir); + } else if (!s.ok()) { + return s; + } + + // if log_readers_ is non-empty, it means we have applied all logs with log + // numbers smaller than the smallest log in log_readers_, so there is no + // need to pass these logs to RecoverLogFiles + uint64_t log_number_min = 0; + if (log_readers_.size() > 0) { + log_number_min = log_readers_.begin()->first; + } + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + number >= log_number_min) { + logs->push_back(number); + } + } + // Recover logs in the order that they were generated + if (!logs->empty()) { + std::sort(logs->begin(), logs->end()); + } + return s; +} + // try to find log reader using log_number from log_readers_ map, initialize // if it doesn't exist Status DBImplSecondary::MaybeInitLogReader( @@ -294,6 +296,18 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, return s; } +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles() { + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); + } + return s; +} + Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { if (read_options.managed) { @@ -377,6 +391,7 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); assert(manifest_reader_.get() != nullptr); Status s; + // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) @@ -389,6 +404,9 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { } sv_context.Clean(); } + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + s = FindAndRecoverLogFiles(); return s; } diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 64c81432848..32dbae058b8 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -194,6 +194,8 @@ class DBImplSecondary : public DBImpl { using DBImpl::Recover; + Status FindAndRecoverLogFiles(); + Status FindNewLogNumbers(std::vector* logs); Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only) override; diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 47daf9fd8cc..60ea5ba8d5f 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -237,6 +237,12 @@ TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { }; verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { From 25d81e4577c30f1da7fe6631f4123a5897de4f98 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 9 May 2019 
12:04:56 -0700 Subject: [PATCH 019/572] DBIter::Next() can skip user key checking if previous entry's seqnum is 0 (#5244) Summary: Right now, DBIter::Next() always checks whether an entry is for the same user key as the previous entry to see whether the key should be hidden from the user. However, if the previous entry's sequence number is 0, the check is not needed because 0 is the oldest possible sequence number. We could extend this from the seqnum 0 case to simply prev_seqno >= current_seqno. However, that is less robust against bugs or unexpected situations, while the gain is relatively low. We can always extend it later when needed. In a readseq benchmark with a fully formed LSM-tree, the number of key comparisons made is reduced from 2.981 to 2.165. In readseq against a fully compacted DB, no key comparisons are made at all. Performance in this benchmark didn't show an obvious improvement, which is expected because key comparisons only take a small percentage of CPU. But it may prove more effective if users have an expensive customized comparator. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5244 Differential Revision: D15067257 Pulled By: siying fbshipit-source-id: b7e1ef3ec4fa928cba509683d2b3246e35d270d9 --- HISTORY.md | 5 ++++- db/db_iter.cc | 29 +++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 65d64d23604..fb1db417ecf 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,9 +4,12 @@ * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. ### New Features -* Reduce binary search when iterator reseek into the same data block. * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. ### Performance Improvements +* Reduce binary search when iterator reseek into the same data block. +* DBIter::Next() can skip user key checking if previous entry's seqnum is 0. ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_iter.cc b/db/db_iter.cc index 43a56af78c7..1d8ccf9adbd 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -133,6 +133,7 @@ class DBIter final: public Iterator { direction_(kForward), valid_(false), current_entry_is_merged_(false), + is_key_seqnum_zero_(false), prefix_same_as_start_(read_options.prefix_same_as_start), pin_thru_lifetime_(read_options.pin_data), total_order_seek_(read_options.total_order_seek), @@ -333,6 +334,10 @@ class DBIter final: public Iterator { Direction direction_; bool valid_; bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This implies that the next entry will be for another + // user key.
+ bool is_key_seqnum_zero_; const bool prefix_same_as_start_; // Means that we will pin all data blocks we read as long the Iterator // is not deleted, will be true if ReadOptions::pin_data is true @@ -381,6 +386,7 @@ void DBIter::Next() { num_internal_keys_skipped_ = 0; bool ok = true; if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; if (!ReverseToForward()) { ok = false; } @@ -400,6 +406,7 @@ void DBIter::Next() { FindNextUserEntry(true /* skipping the current user key */, prefix_same_as_start_); } else { + is_key_seqnum_zero_ = false; valid_ = false; } if (statistics_ != nullptr && valid_) { @@ -450,10 +457,16 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_blob_ = false; do { + // Will update is_key_seqnum_zero_ as soon as we parse the current key, + // but we need to save the previous value to be used in the loop. + bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; return false; } + is_key_seqnum_zero_ = (ikey_.sequence == 0); + if (iterate_upper_bound_ != nullptr && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; @@ -470,11 +483,18 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) } if (IsVisible(ikey_.sequence)) { - if (skipping && user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) <= 0) { + // If the previous entry has seqnum 0, the current entry cannot + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because relaxing it + // would be more vulnerable to bugs that produce the same user key with + // the same sequence number. + if (!is_prev_key_seqnum_zero && skipping && + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= + 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { + assert(!skipping || user_comparator_.Compare( + ikey_.user_key, saved_key_.GetUserKey()) > 0); num_skipped = 0; switch (ikey_.type) { case kTypeDeletion: @@ -595,6 +615,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. if (num_skipped > max_skip_ && CanReseekToSkip()) { + is_key_seqnum_zero_ = false; num_skipped = 0; std::string last_key; if (skipping) { @@ -1265,6 +1286,7 @@ void DBIter::Seek(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + is_key_seqnum_zero_ = false; SequenceNumber seq = sequence_; saved_key_.Clear(); @@ -1323,6 +1345,7 @@ void DBIter::SeekForPrev(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + is_key_seqnum_zero_ = false; saved_key_.Clear(); // now saved_key is used to store internal key.
saved_key_.SetInternalKey(target, 0 /* sequence_number */, @@ -1390,6 +1413,7 @@ void DBIter::SeekToFirst() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); @@ -1442,6 +1466,7 @@ void DBIter::SeekToLast() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); From 181bb43f08c77be7af72ceea12b9c66b8ab5fd7d Mon Sep 17 00:00:00 2001 From: anand76 Date: Thu, 9 May 2019 13:03:37 -0700 Subject: [PATCH 020/572] Fix bugs in FilePickerMultiGet (#5292) Summary: This PR fixes a couple of bugs in FilePickerMultiGet that were causing db_stress test failures. The failures were caused by:
1. Improper handling of a key that matches the user key portion of an L0 file's largest key. In this case, the curr_index_in_curr_level file index in L0 for that key was getting incremented, but batch_iter_ was not advanced. By design, all keys in a batch are supposed to be checked against an L0 file before advancing to the next L0 file. Not advancing to the next key in the batch was causing a double increment of curr_index_in_curr_level due to the same key being processed again.
2. Improper handling of a key that matches the user key portion of the largest key in the last file of L1 and higher. This was resulting in a premature end to the processing of the batch for that level when the next key in the batch is a duplicate. Typically, the keys in MultiGet will not be duplicates, but it's good to handle that case correctly.
Test - asan_crash make check Pull Request resolved: https://github.com/facebook/rocksdb/pull/5292 Differential Revision: D15282530 Pulled By: anand1976 fbshipit-source-id: d1a6a86e0af273169c3632db22a44d79c66a581f --- db/version_set.cc | 20 ++++++++++++++++++-- tools/db_crashtest.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 6d4fb7315ad..8463a5aa735 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -416,6 +416,18 @@ class FilePickerMultiGet { bool file_hit = false; int cmp_largest = -1; if (curr_file_index >= curr_file_level_->num_files) { + // In the unlikely case the next key is a duplicate of the current key, + // and the current key is the last in the level and the internal key + // was not found, we need to skip lookup for the remaining keys and + // reset the search bounds + if (batch_iter_ != current_level_range_.end()) { + ++batch_iter_; + for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + } return false; } // Loops over keys in the MultiGet batch until it finds a file with @@ -533,7 +545,10 @@ class FilePickerMultiGet { // any further for that key, so advance batch_iter_.
Else, keep // batch_iter_ positioned on that key so we look it up again in // the next file - if (current_level_range_.CheckKeyDone(batch_iter_)) { + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { ++batch_iter_; } } @@ -601,7 +616,8 @@ class FilePickerMultiGet { unsigned int start_index_in_curr_level; FilePickerContext(int32_t left, int32_t right) - : search_left_bound(left), search_right_bound(right) {} + : search_left_bound(left), search_right_bound(right), + curr_index_in_curr_level(0), start_index_in_curr_level(0) {} FilePickerContext() = default; }; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 780c987e929..6487562d8bb 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -65,7 +65,7 @@ "writepercent": 35, "format_version": lambda: random.randint(2, 4), "index_block_restart_interval": lambda: random.choice(range(1, 16)), - "use_multiget" : 0, + "use_multiget" : lambda: random.randint(0, 1), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' From 9fad3e21eb90d215b6719097baba417bc1eeca3c Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 9 May 2019 14:15:12 -0700 Subject: [PATCH 021/572] Merging iterator to avoid child iterator reseek for some cases (#5286) Summary: When a reseek happens in the merging iterator, reseeking a child iterator can be avoided if: (1) the iterator represents immutable data, (2) the reseek() is to a larger key than the current key, and (3) the current key of the child iterator is larger than the seek key, because it is guaranteed that the result will fall into the same position. This optimization will be useful for use cases where users keep seeking to keys nearby in ascending order. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5286 Differential Revision: D15283635 Pulled By: siying fbshipit-source-id: 35f79ffd5ce3609146faa8cd55f2bfd733502f83 --- HISTORY.md | 1 + db/db_iterator_test.cc | 69 ++++++++++++++++++++++++++++++++ db/version_set.cc | 3 +- table/block_based_table_reader.h | 3 +- table/internal_iterator.h | 5 ++- table/iterator_wrapper.h | 7 +++- table/merging_iterator.cc | 19 ++++++++- 7 files changed, 101 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index fb1db417ecf..99235a33d5c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
+* Merging iterator to avoid child iterator reseek for some cases ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 78b387577dd..cc1af2e0ad8 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2548,6 +2548,75 @@ TEST_P(DBIteratorTest, AvoidReseekLevelIterator) { SyncPoint::GetInstance()->DisableProcessing(); } +TEST_P(DBIteratorTest, AvoidReseekChildIterator) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 800; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + std::string random_str = RandomString(&rnd, 180); + + ASSERT_OK(Put("1", random_str)); + ASSERT_OK(Put("2", random_str)); + ASSERT_OK(Put("3", random_str)); + ASSERT_OK(Put("4", random_str)); + ASSERT_OK(Put("8", random_str)); + ASSERT_OK(Put("9", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("5", random_str)); + ASSERT_OK(Put("6", random_str)); + ASSERT_OK(Put("7", random_str)); + ASSERT_OK(Flush()); + + // These two keys will be kept in memtable. + ASSERT_OK(Put("0", random_str)); + ASSERT_OK(Put("8", random_str)); + + int num_iter_wrapper_seek = 0; + SyncPoint::GetInstance()->SetCallBack( + "IteratorWrapper::Seek:0", + [&](void* /*arg*/) { num_iter_wrapper_seek++; }); + SyncPoint::GetInstance()->EnableProcessing(); + { + std::unique_ptr iter(NewIterator(ReadOptions())); + iter->Seek("1"); + ASSERT_TRUE(iter->Valid()); + // DBIter always wraps internal iterator with IteratorWrapper, + // and in merging iterator each child iterator will be wrapped + // with IteratorWrapper. + ASSERT_EQ(4, num_iter_wrapper_seek); + + // child position: 1 and 5 + num_iter_wrapper_seek = 0; + iter->Seek("2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_iter_wrapper_seek); + + // child position: 2 and 5 + num_iter_wrapper_seek = 0; + iter->Seek("6"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(4, num_iter_wrapper_seek); + + // child position: 8 and 6 + num_iter_wrapper_seek = 0; + iter->Seek("7"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_iter_wrapper_seek); + + // child position: 8 and 7 + num_iter_wrapper_seek = 0; + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(4, num_iter_wrapper_seek); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 8463a5aa735..84302556e66 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -896,7 +896,8 @@ class LevelIterator final : public InternalIterator { bool skip_filters, int level, RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr) - : table_cache_(table_cache), + : InternalIterator(false), + table_cache_(table_cache), read_options_(read_options), env_options_(env_options), icomparator_(icomparator), diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 1fcc8cbfa07..74d2caeb28b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -590,7 +590,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool key_includes_seq = true, bool index_key_is_full = true, bool for_compaction = false) - : table_(table), + : InternalIteratorBase(false), + table_(table), read_options_(read_options), icomp_(icomp), user_comparator_(icomp.user_comparator()), diff --git 
a/table/internal_iterator.h b/table/internal_iterator.h index 6b713e7b951..8f1cc9dd68e 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -20,7 +20,8 @@ class PinnedIteratorsManager; template class InternalIteratorBase : public Cleanable { public: - InternalIteratorBase() {} + InternalIteratorBase() : is_mutable_(true) {} + InternalIteratorBase(bool _is_mutable) : is_mutable_(_is_mutable) {} virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or @@ -119,6 +120,7 @@ class InternalIteratorBase : public Cleanable { virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { return Status::NotSupported(""); } + bool is_mutable() const { return is_mutable_; } protected: void SeekForPrevImpl(const Slice& target, const Comparator* cmp) { @@ -130,6 +132,7 @@ class InternalIteratorBase : public Cleanable { Prev(); } } + bool is_mutable_; private: // No copying allowed diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index fc5eb2613d8..a570e53c1e2 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -69,7 +69,12 @@ class IteratorWrapperBase { assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void Seek(const Slice& k) { + TEST_SYNC_POINT("IteratorWrapper::Seek:0"); + assert(iter_); + iter_->Seek(k); + Update(); + } void SeekForPrev(const Slice& k) { assert(iter_); iter_->SeekForPrev(k); diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index bd4a186b3c2..e5df6bdf6f0 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -127,14 +127,29 @@ class MergingIterator : public InternalIterator { } void Seek(const Slice& target) override { + bool is_increasing_reseek = false; + if (current_ != nullptr && direction_ == kForward && status_.ok() && + comparator_->Compare(target, key()) >= 0) { + is_increasing_reseek = true; + } ClearHeaps(); status_ = Status::OK(); for (auto& child : children_) { - { + // If upper bound never changes, we can skip Seek() for + // the !Valid() case too, but people do hack the code to change + // upper bound between Seek(), so it's not a good idea to break + // the API. + // If DBIter is used on top of merging iterator, we probably + // can skip mutable child iterators if they are invalid too, + // but it's a less clean API. We can optimize for it later if + // needed. + if (!is_increasing_reseek || !child.Valid() || + comparator_->Compare(target, child.key()) > 0 || + child.iter()->is_mutable()) { PERF_TIMER_GUARD(seek_child_seek_time); child.Seek(target); + PERF_COUNTER_ADD(seek_child_seek_count, 1); } - PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { assert(child.status().ok()); From 6451673f379319755ff238ffef18c674ce37bd0b Mon Sep 17 00:00:00 2001 From: Jelte Fennema Date: Thu, 9 May 2019 18:16:45 -0700 Subject: [PATCH 022/572] Add C bindings for LowerThreadPoolIO/CPUPriority (#5285) Summary: There were no C bindings for lowering thread pool priority. This adds those. 
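For illustration (not part of the patch), here is a minimal sketch of how the new functions slot into the existing C API env workflow; `rocksdb_create_default_env` and the thread-count setters are pre-existing C API calls, and the snippet compiles as C++:

    #include "rocksdb/c.h"

    int main() {
      rocksdb_env_t* env = rocksdb_create_default_env();
      // Size the pools first (existing C API calls).
      rocksdb_env_set_background_threads(env, 4);                // LOW pool
      rocksdb_env_set_high_priority_background_threads(env, 2);  // HIGH pool
      // New in this patch: lower the IO and CPU priority of both pools.
      rocksdb_env_lower_thread_pool_io_priority(env);
      rocksdb_env_lower_thread_pool_cpu_priority(env);
      rocksdb_env_lower_high_priority_thread_pool_io_priority(env);
      rocksdb_env_lower_high_priority_thread_pool_cpu_priority(env);
      // The env would then be attached to a rocksdb_options_t via
      // rocksdb_options_set_env() before opening a DB.
      rocksdb_env_destroy(env);
      return 0;
    }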
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5285 Differential Revision: D15290050 Pulled By: siying fbshipit-source-id: b2ed94d0c39d27434ace2204829a242b53d0d67a --- db/c.cc | 16 ++++++++++++++++ include/rocksdb/c.h | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/db/c.cc b/db/c.cc index aac1cf4087c..58b51e2523e 100644 --- a/db/c.cc +++ b/db/c.cc @@ -3268,6 +3268,22 @@ void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a0ae7ca7785..ed0709d22a1 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1301,6 +1301,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); + extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); From f0bf3bf34b068a918b4969812553f21958f79ea6 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 10 May 2019 11:53:33 -0700 Subject: [PATCH 023/572] Turn CachableEntry into a proper resource handle (#5252) Summary: CachableEntry is used in a variety of contexts: it may refer to a cached object (i.e. an object in the block cache), an owned object, or an unowned object; also, in some cases (most notably with iterators), the responsibility of managing the pointed-to object gets handed off to another object. Each of the above scenarios has different implications for the lifecycle of the referenced object. For the most part, the patch does not change the lifecycle of managed objects; however, it makes these relationships explicit, and it also enables us to eliminate some hacks and accident-prone code around releasing cache handles and deleting/cleaning up objects. (The only places where the patch changes how objects are managed are the partitions of partitioned indexes and filters.)
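To make the scenarios concrete, here is an illustrative sketch (not from the patch; `MyBlock` is a hypothetical stand-in type) of scenarios #2 and #4 using the interface introduced below:

    #include <cassert>
    #include <utility>
    #include "rocksdb/cleanable.h"
    #include "table/cachable_entry.h"

    struct MyBlock {};  // stand-in for Block, FilterBlockReader, etc.

    void OwnershipSketch() {
      // Scenario #2: the entry uniquely owns a non-cached object.
      rocksdb::CachableEntry<MyBlock> entry(new MyBlock(), /* cache */ nullptr,
                                            /* cache_handle */ nullptr,
                                            /* own_value */ true);
      assert(entry.GetOwnValue());

      // The type is move-only; a move leaves the source empty.
      rocksdb::CachableEntry<MyBlock> moved(std::move(entry));
      assert(entry.IsEmpty());

      // Scenario #4: hand cleanup off to a Cleanable, as done for iterators.
      // The MyBlock is deleted when `c` runs its cleanup chain.
      rocksdb::Cleanable c;
      moved.TransferTo(&c);
      assert(moved.IsEmpty());
    }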
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5252 Differential Revision: D15101358 Pulled By: ltamasi fbshipit-source-id: 9eb59e9ae5a7230e3345789762d0ba1f189485be --- table/block_based_table_reader.cc | 244 +++++++++++-------------- table/block_based_table_reader.h | 29 +-- table/cachable_entry.h | 219 ++++++++++++++++++++++ table/partitioned_filter_block.cc | 66 ++----- table/partitioned_filter_block.h | 13 +- table/partitioned_filter_block_test.cc | 3 +- 6 files changed, 351 insertions(+), 223 deletions(-) create mode 100644 table/cachable_entry.h diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 514587d0b96..1dc220ddec5 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -112,12 +112,6 @@ inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( : nullptr; } -// Delete the resource that is held by the iterator. -template -void DeleteHeldResource(void* arg, void* /*ignored*/) { - delete reinterpret_cast(arg); -} - // Delete the entry resided in the cache. template void DeleteCachedEntry(const Slice& /*key*/, void* value) { @@ -224,7 +218,7 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, } // namespace // Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public IndexReader, public Cleanable { +class PartitionIndexReader : public IndexReader { public: // Read the partition index from the file and create an instance for // `PartitionIndexReader`. @@ -332,10 +326,9 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // After prefetch, read the partitions one by one biter.SeekToFirst(); auto ro = ReadOptions(); - Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { handle = biter.value(); - BlockBasedTable::CachableEntry block; + CachableEntry block; const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks @@ -344,18 +337,12 @@ class PartitionIndexReader : public IndexReader, public Cleanable { UncompressionDict::GetEmptyDict(), &block, is_index, nullptr /* get_context */); - assert(s.ok() || block.value == nullptr); - if (s.ok() && block.value != nullptr) { - if (block.cache_handle != nullptr) { + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { if (pin) { - partition_map_[handle.offset()] = block; - RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - block_cache->Release(block.cache_handle); + partition_map_[handle.offset()] = std::move(block); } - } else { - delete block.value; } } } @@ -391,8 +378,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { } BlockBasedTable* table_; std::unique_ptr index_block_; - std::unordered_map> - partition_map_; + std::unordered_map> partition_map_; const bool index_key_includes_seq_; const bool index_value_is_full_; }; @@ -1221,14 +1207,12 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // This is the first call to NewIndexIterator() since we're in Open(). // On success it should give us ownership of the `CachableEntry` by // populating `index_entry`. 
- assert(index_entry.value != nullptr); + assert(index_entry.GetValue() != nullptr); if (prefetch_all) { - index_entry.value->CacheDependencies(pin_all); + index_entry.GetValue()->CacheDependencies(pin_all); } if (pin_index) { rep->index_entry = std::move(index_entry); - } else { - index_entry.Release(table_options.block_cache.get()); } } } @@ -1236,17 +1220,15 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = new_table->GetFilter(rep->table_prefix_extractor.get()); - if (filter_entry.value != nullptr && prefetch_all) { - filter_entry.value->CacheDependencies( + if (filter_entry.GetValue() != nullptr && prefetch_all) { + filter_entry.GetValue()->CacheDependencies( pin_all, rep->table_prefix_extractor.get()); } // if pin_filter is true then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the // cache while this reader is alive if (pin_filter) { - rep->filter_entry = filter_entry; - } else { - filter_entry.Release(table_options.block_cache.get()); + rep->filter_entry = std::move(filter_entry); } } } else { @@ -1369,10 +1351,13 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, Rep* rep, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, bool is_index, GetContext* get_context) { + + assert(block); + assert(block->IsEmpty()); + Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; @@ -1380,7 +1365,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Lookup uncompressed cache first if (block_cache != nullptr) { - block->cache_handle = GetEntryFromCache( + auto cache_handle = GetEntryFromCache( block_cache, block_cache_key, rep->level, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, @@ -1393,15 +1378,16 @@ Status BlockBasedTable::GetDataBlockFromCache( : &get_context->get_context_stats_.num_cache_data_hit) : nullptr, statistics, get_context); - if (block->cache_handle != nullptr) { - block->value = - reinterpret_cast(block_cache->Value(block->cache_handle)); + if (cache_handle != nullptr) { + block->SetCachedValue( + reinterpret_cast(block_cache->Value(cache_handle)), + block_cache, cache_handle); return s; } } // If not found, search from the compressed block cache. 
- assert(block->cache_handle == nullptr && block->value == nullptr); + assert(block->IsEmpty()); if (block_cache_compressed == nullptr) { return s; @@ -1435,20 +1421,25 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - block->value = - new Block(std::move(contents), rep->get_global_seqno(is_index), - read_amp_bytes_per_bit, - statistics); // uncompressed block - if (block_cache != nullptr && block->value->own_bytes() && + std::unique_ptr block_holder( + new Block(std::move(contents), rep->get_global_seqno(is_index), + read_amp_bytes_per_bit, statistics)); // uncompressed block + + if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { - size_t charge = block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, block->value, charge, + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, - &(block->cache_handle)); + &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG if (s.ok()) { + assert(cache_handle != nullptr); + block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + if (get_context != nullptr) { get_context->get_context_stats_.num_cache_add++; get_context->get_context_stats_.num_cache_bytes_write += charge; @@ -1477,9 +1468,9 @@ Status BlockBasedTable::GetDataBlockFromCache( } } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete block->value; - block->value = nullptr; } + } else { + block->SetOwnedValue(block_holder.release()); } } @@ -1497,33 +1488,34 @@ Status BlockBasedTable::PutDataBlockToCache( const UncompressionDict& uncompression_dict, SequenceNumber seq_no, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, bool is_index, Cache::Priority priority, GetContext* get_context) { + + assert(cached_block); + assert(cached_block->IsEmpty()); assert(raw_block_comp_type == kNoCompression || block_cache_compressed != nullptr); Status s; - // Retrieve the uncompressed contents into a new buffer - BlockContents uncompressed_block_contents; Statistics* statistics = ioptions.statistics; + + std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; UncompressionContext context(raw_block_comp_type); UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); s = UncompressBlockContents(info, raw_block_contents->data.data(), raw_block_contents->data.size(), &uncompressed_block_contents, format_version, ioptions, memory_allocator); - } - if (!s.ok()) { - return s; - } + if (!s.ok()) { + return s; + } - if (raw_block_comp_type != kNoCompression) { - cached_block->value = new Block(std::move(uncompressed_block_contents), - seq_no, read_amp_bytes_per_bit, - statistics); // uncompressed block + block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no, + read_amp_bytes_per_bit, statistics)); } else { - cached_block->value = - new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, ioptions.statistics); + block_holder.reset(new Block(std::move(*raw_block_contents), seq_no, + read_amp_bytes_per_bit, statistics)); } // Insert compressed block into compressed block cache. 
@@ -1553,16 +1545,20 @@ Status BlockBasedTable::PutDataBlockToCache( } // insert into uncompressed block cache - if (block_cache != nullptr && cached_block->value->own_bytes()) { - size_t charge = cached_block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, cached_block->value, charge, + if (block_cache != nullptr && block_holder->own_bytes()) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, - &(cached_block->cache_handle), priority); + &cache_handle, priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG if (s.ok()) { - assert(cached_block->cache_handle != nullptr); + assert(cache_handle != nullptr); + cached_block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + if (get_context != nullptr) { get_context->get_context_stats_.num_cache_add++; get_context->get_context_stats_.num_cache_bytes_write += charge; @@ -1589,12 +1585,12 @@ Status BlockBasedTable::PutDataBlockToCache( } } assert(reinterpret_cast(block_cache->Value( - cached_block->cache_handle)) == cached_block->value); + cached_block->GetCacheHandle())) == cached_block->GetValue()); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete cached_block->value; - cached_block->value = nullptr; } + } else { + cached_block->SetOwnedValue(block_holder.release()); } return s; @@ -1668,7 +1664,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter( } } -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( +CachableEntry BlockBasedTable::GetFilter( const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context) const { const BlockHandle& filter_blk_handle = rep_->filter_handle; @@ -1677,7 +1673,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( no_io, get_context, prefix_extractor); } -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( +CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, const SliceTransform* prefix_extractor) const { @@ -1687,17 +1683,19 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // most probably fail again. 
if (!is_a_filter_partition && !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), nullptr /* cache handle */}; + return {rep_->filter.get(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } Cache* block_cache = rep_->table_options.block_cache.get(); if (rep_->filter_policy == nullptr /* do not use filter */ || block_cache == nullptr /* no block cache at all */) { - return {nullptr /* filter */, nullptr /* cache handle */}; + return CachableEntry(); } - if (!is_a_filter_partition && rep_->filter_entry.IsSet()) { - return rep_->filter_entry; + if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { + return {rep_->filter_entry.GetValue(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } PERF_TIMER_GUARD(read_filter_block_nanos); @@ -1708,7 +1706,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( filter_blk_handle, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( + Cache::Handle* cache_handle = GetEntryFromCache( block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, get_context ? &get_context->get_context_stats_.num_cache_filter_miss @@ -1757,20 +1755,22 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( } } - return {filter, cache_handle}; + return {filter, cache_handle ? block_cache : nullptr, cache_handle, + false /* own_value */}; } -BlockBasedTable::CachableEntry +CachableEntry BlockBasedTable::GetUncompressionDict(Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context) { if (!rep->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. - return {rep->uncompression_dict.get(), nullptr /* cache handle */}; + return {rep->uncompression_dict.get(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } if (rep->compression_dict_handle.IsNull()) { - return {nullptr, nullptr}; + return CachableEntry(); } char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto cache_key = @@ -1835,7 +1835,8 @@ BlockBasedTable::GetUncompressionDict(Rep* rep, assert(cache_handle == nullptr); } } - return {dict, cache_handle}; + return {dict, cache_handle ? rep->table_options.block_cache.get() : nullptr, + cache_handle, false /* own_value */}; } // disable_prefix_seek should be set to true when prefix_extractor found in SST @@ -1853,10 +1854,10 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( read_options.fill_cache); } // we have a pinned index block - if (rep_->index_entry.IsSet()) { + if (rep_->index_entry.IsCached()) { // We don't return pinned datat from index blocks, so no need // to set `block_contents_pinned`. 
- return rep_->index_entry.value->NewIterator( + return rep_->index_entry.GetValue()->NewIterator( input_iter, read_options.total_order_seek || disable_prefix_seek, read_options.fill_cache); } @@ -1948,7 +1949,8 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( // the caller would like to take ownership of the index block // don't call RegisterCleanup() in this case, the caller will take care of it if (index_entry != nullptr) { - *index_entry = {index_reader, cache_handle}; + *index_entry = {index_reader, block_cache, cache_handle, + false /* own_value */}; } else { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); } @@ -1976,9 +1978,9 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( auto uncompression_dict_storage = GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); const UncompressionDict& uncompression_dict = - uncompression_dict_storage.value == nullptr + uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.value; + : *uncompression_dict_storage.GetValue(); if (s.ok()) { s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, uncompression_dict, &block, is_index, @@ -1991,7 +1993,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( iter = new TBlockIter; } // Didn't get any data from block caches. - if (s.ok() && block.value == nullptr) { + if (s.ok() && block.GetValue() == nullptr) { if (no_io) { // Could not read from block_cache and can't do IO iter->Invalidate(Status::Incomplete("no blocking io")); @@ -2012,16 +2014,15 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( GetMemoryAllocator(rep->table_options)); } if (s.ok()) { - block.value = block_value.release(); + block.SetOwnedValue(block_value.release()); } } // TODO(ajkr): also pin compression dictionary block when // `pin_l0_filter_and_index_blocks_in_cache == true`. - uncompression_dict_storage.Release(block_cache); } if (s.ok()) { - assert(block.value != nullptr); + assert(block.GetValue() != nullptr); const bool kTotalOrderSeek = true; // Block contents are pinned and it is still pinned after the iterator // is destroyed as long as cleanup functions are moved to another object, @@ -2031,16 +2032,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // not reading data from the original source, whether immortal or not. // Otherwise, the block is pinned iff the source is immortal. 
bool block_contents_pinned = - (block.cache_handle != nullptr || - (!block.value->own_bytes() && rep->immortal_table)); - iter = block.value->NewIterator( + (block.IsCached() || + (!block.GetValue()->own_bytes() && rep->immortal_table)); + iter = block.GetValue()->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, index_key_is_full, block_contents_pinned); - if (block.cache_handle != nullptr) { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { + if (!block.IsCached()) { if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { // insert a dummy record to block cache to track the memory usage Cache::Handle* cache_handle; @@ -2063,8 +2061,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( Slice unique_key = Slice(cache_key, static_cast(end - cache_key)); s = block_cache->Insert(unique_key, nullptr, - block.value->ApproximateMemoryUsage(), nullptr, - &cache_handle); + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); if (s.ok()) { if (cache_handle != nullptr) { iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, @@ -2072,10 +2070,11 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } } } - iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); } + + block.TransferTo(iter); } else { - assert(block.value == nullptr); + assert(block.GetValue() == nullptr); iter->Invalidate(s); } return iter; @@ -2122,7 +2121,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // Can't find the block from the cache. If I/O is allowed, read from the // file. - if (block_entry->value == nullptr && !no_io && ro.fill_cache) { + if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { Statistics* statistics = rep->ioptions.statistics; bool do_decompress = block_cache_compressed == nullptr && rep->blocks_maybe_compressed; @@ -2159,7 +2158,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } } } - assert(s.ok() || block_entry->value == nullptr); + assert(s.ok() || block_entry->GetValue() == nullptr); return s; } @@ -2187,11 +2186,11 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( Cache* block_cache = rep->table_options.block_cache.get(); assert(block_cache); RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.cache_handle)); + block_cache->GetUsage(block->second.GetCacheHandle())); Statistics* kNullStats = nullptr; // We don't return pinned datat from index blocks, so no need // to set `block_contents_pinned`. 
- return block->second.value->NewIterator( + return block->second.GetValue()->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); } @@ -2239,7 +2238,7 @@ bool BlockBasedTable::PrefixMayMatch( // First, try check with full filter auto filter_entry = GetFilter(prefix_extractor); - FilterBlockReader* filter = filter_entry.value; + FilterBlockReader* filter = filter_entry.GetValue(); bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { @@ -2251,9 +2250,6 @@ bool BlockBasedTable::PrefixMayMatch( } else { // if prefix_extractor changed for block based filter, skip filter if (need_upper_bound_check) { - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } return true; } auto prefix = prefix_extractor->Transform(user_key); @@ -2317,12 +2313,6 @@ bool BlockBasedTable::PrefixMayMatch( } } - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } return may_match; } @@ -2734,7 +2724,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, read_options.read_tier == kBlockCacheTier, get_context); } - filter = filter_entry.value; + filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block @@ -2838,12 +2828,6 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } return s; } @@ -2864,7 +2848,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, read_options.read_tier == kBlockCacheTier, nullptr /*get_context*/); } - filter = filter_entry.value; + filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block @@ -2954,13 +2938,6 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, *(miter->s) = s; } } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } } Status BlockBasedTable::Prefetch(const Slice* const begin, @@ -3144,11 +3121,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); } assert(s.ok()); - bool in_cache = block.value != nullptr; - if (in_cache) { - ReleaseCachedEntry(block_cache, block.cache_handle); - } - return in_cache; + return block.IsCached(); } BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { @@ -3494,9 +3467,6 @@ void BlockBasedTable::Close() { Cache* const cache = rep_->table_options.block_cache.get(); - rep_->filter_entry.Release(cache); - rep_->index_entry.Release(cache); - // cleanup index, 
filter, and compression dictionary blocks
   // to avoid accessing dangling pointers
   if (!rep_->table_options.no_block_cache) {
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 74d2caeb28b..385e50ab79f 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -25,6 +25,7 @@
 #include "rocksdb/table.h"
 #include "table/block.h"
 #include "table/block_based_table_factory.h"
+#include "table/cachable_entry.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "table/get_context.h"
@@ -220,8 +221,6 @@ class BlockBasedTable : public TableReader {
   // The key retrieved are internal keys.
   Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
 
-  template <class TValue>
-  struct CachableEntry;
   struct Rep;
 
   Rep* get_rep() { return rep_; }
@@ -311,8 +310,7 @@ class BlockBasedTable : public TableReader {
       const Slice& block_cache_key, const Slice& compressed_block_cache_key,
       Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
       const ReadOptions& read_options,
-      BlockBasedTable::CachableEntry<Block>* block,
-      const UncompressionDict& uncompression_dict,
+      CachableEntry<Block>* block, const UncompressionDict& uncompression_dict,
       size_t read_amp_bytes_per_bit, bool is_index = false,
       GetContext* get_context = nullptr);
 
@@ -446,29 +444,6 @@ class BlockBasedTable::PartitionedIndexIteratorState
   bool index_key_is_full_;
 };
 
-// CachableEntry represents the entries that *may* be fetched from block cache.
-// field `value` is the item we want to get.
-// field `cache_handle` is the cache handle to the block cache. If the value
-// was not read from cache, `cache_handle` will be nullptr.
-template <class TValue>
-struct BlockBasedTable::CachableEntry {
-  CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
-      : value(_value), cache_handle(_cache_handle) {}
-  CachableEntry() : CachableEntry(nullptr, nullptr) {}
-  void Release(Cache* cache, bool force_erase = false) {
-    if (cache_handle) {
-      cache->Release(cache_handle, force_erase);
-      value = nullptr;
-      cache_handle = nullptr;
-    }
-  }
-  bool IsSet() const { return cache_handle != nullptr; }
-
-  TValue* value = nullptr;
-  // if the entry is from the cache, cache_handle will be populated.
-  Cache::Handle* cache_handle = nullptr;
-};
-
 struct BlockBasedTable::Rep {
   Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
       const BlockBasedTableOptions& _table_opt,
diff --git a/table/cachable_entry.h b/table/cachable_entry.h
new file mode 100644
index 00000000000..5b5d16ef318
--- /dev/null
+++ b/table/cachable_entry.h
@@ -0,0 +1,219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include "rocksdb/cache.h"
+#include "rocksdb/cleanable.h"
+
+namespace rocksdb {
+
+// CachableEntry is a handle to an object that may or may not be in the block
+// cache. It is used in a variety of ways:
+//
+// 1) It may refer to an object in the block cache. In this case, cache_ and
+// cache_handle_ are not nullptr, and the cache handle has to be released when
+// the CachableEntry is destroyed (the lifecycle of the cached object, on the
+// other hand, is managed by the cache itself).
+// 2) It may uniquely own the (non-cached) object it refers to (examples include
+// a block read directly from file, or uncompressed blocks when there is a
+// compressed block cache but no uncompressed block cache). In such cases, the
+// object has to be destroyed when the CachableEntry is destroyed.
+// 3) It may point to an object (cached or not) without owning it. In this case,
+// no action is needed when the CachableEntry is destroyed.
+// 4) Sometimes, management of a cached or owned object (see #1 and #2 above)
+// is transferred to some other object. This is used for instance with iterators
+// (where cleanup is performed using a chain of cleanup functions,
+// see Cleanable).
+//
+// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not
+// allowed); hence, this is a move-only type, where a move transfers the
+// management responsibilities, and leaves the source object in an empty state.
+
+template <class T>
+class CachableEntry {
+public:
+  CachableEntry() = default;
+
+  CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle,
+    bool own_value)
+    : value_(value)
+    , cache_(cache)
+    , cache_handle_(cache_handle)
+    , own_value_(own_value)
+  {
+    assert(value_ != nullptr ||
+      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+  }
+
+  CachableEntry(const CachableEntry&) = delete;
+  CachableEntry& operator=(const CachableEntry&) = delete;
+
+  CachableEntry(CachableEntry&& rhs)
+    : value_(rhs.value_)
+    , cache_(rhs.cache_)
+    , cache_handle_(rhs.cache_handle_)
+    , own_value_(rhs.own_value_)
+  {
+    assert(value_ != nullptr ||
+      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+
+    rhs.ResetFields();
+  }
+
+  CachableEntry& operator=(CachableEntry&& rhs) {
+    if (UNLIKELY(this == &rhs)) {
+      return *this;
+    }
+
+    ReleaseResource();
+
+    value_ = rhs.value_;
+    cache_ = rhs.cache_;
+    cache_handle_ = rhs.cache_handle_;
+    own_value_ = rhs.own_value_;
+
+    assert(value_ != nullptr ||
+      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+
+    rhs.ResetFields();
+
+    return *this;
+  }
+
+  ~CachableEntry() {
+    ReleaseResource();
+  }
+
+  bool IsEmpty() const {
+    return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr &&
+      !own_value_;
+  }
+
+  bool IsCached() const {
+    assert(!!cache_ == !!cache_handle_);
+
+    return cache_handle_ != nullptr;
+  }
+
+  T* GetValue() const { return value_; }
+  Cache* GetCache() const { return cache_; }
+  Cache::Handle* GetCacheHandle() const { return cache_handle_; }
+  bool GetOwnValue() const { return own_value_; }
+
+  void Reset() {
+    ReleaseResource();
+    ResetFields();
+  }
+
+  void TransferTo(Cleanable* cleanable) {
+    if (cleanable) {
+      if (cache_handle_ != nullptr) {
+        assert(cache_ != nullptr);
+        cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_);
+      } else if (own_value_) {
+        cleanable->RegisterCleanup(&DeleteValue, value_, nullptr);
+      }
+    }
+
+    ResetFields();
+  }
+
+  void SetOwnedValue(T* value) {
+    assert(value != nullptr);
+
+    if (UNLIKELY(value_ == value && own_value_)) {
+      assert(cache_ == nullptr && cache_handle_ == nullptr);
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    own_value_ = true;
+  }
+
+  void SetUnownedValue(T* value) {
+    assert(value != nullptr);
+
+    if (UNLIKELY(value_ == value && cache_ == nullptr &&
+      cache_handle_ == nullptr && !own_value_)) {
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    assert(!own_value_);
+  }
+
+  void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) {
+    assert(value != nullptr);
+    assert(cache != nullptr);
+    assert(cache_handle != nullptr);
+
+    if (UNLIKELY(value_ == value && cache_ == cache &&
+      cache_handle_ == cache_handle && !own_value_)) {
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    cache_ = cache;
+    cache_handle_ = cache_handle;
+    assert(!own_value_);
+  }
+
+private:
+  void ReleaseResource() {
+    if (LIKELY(cache_handle_ != nullptr)) {
+      assert(cache_ != nullptr);
+      cache_->Release(cache_handle_);
+    } else if (own_value_) {
+      delete value_;
+    }
+  }
+
+  void ResetFields() {
+    value_ = nullptr;
+    cache_ = nullptr;
+    cache_handle_ = nullptr;
+    own_value_ = false;
+  }
+
+  static void ReleaseCacheHandle(void* arg1, void* arg2) {
+    Cache* const cache = static_cast<Cache*>(arg1);
+    assert(cache);
+
+    Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+    assert(cache_handle);
+
+    cache->Release(cache_handle);
+  }
+
+  static void DeleteValue(void* arg1, void* /* arg2 */) {
+    delete static_cast<T*>(arg1);
+  }
+
+private:
+  T* value_ = nullptr;
+  Cache* cache_ = nullptr;
+  Cache::Handle* cache_handle_ = nullptr;
+  bool own_value_ = false;
+};
+
+} // namespace rocksdb
diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc
index aab0f5509b9..3ccc7946393 100644
--- a/table/partitioned_filter_block.cc
+++ b/table/partitioned_filter_block.cc
@@ -176,24 +176,14 @@ bool PartitionedFilterBlockReader::KeyMayMatch(
   if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
     return false;
   }
-  bool cached = false;
   auto filter_partition =
       GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
-                         &cached, prefix_extractor);
-  if (UNLIKELY(!filter_partition.value)) {
+                         prefix_extractor);
+  if (UNLIKELY(!filter_partition.GetValue())) {
    return true;
   }
-  auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor,
-                                                 block_offset, no_io);
-  if (cached) {
-    return res;
-  }
-  if (LIKELY(filter_partition.IsSet())) {
-    filter_partition.Release(table_->rep_->table_options.block_cache.get());
-  } else {
-    delete filter_partition.value;
-  }
-  return res;
+  return filter_partition.GetValue()->KeyMayMatch(key, prefix_extractor,
+                                                  block_offset, no_io);
 }
 
 bool PartitionedFilterBlockReader::PrefixMayMatch(
@@ -215,24 +205,14 @@ bool PartitionedFilterBlockReader::PrefixMayMatch(
   if (UNLIKELY(filter_handle.size() == 0)) {  // prefix is out of range
     return false;
   }
-  bool cached = false;
   auto filter_partition =
       GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io,
-                         &cached, prefix_extractor);
-  if (UNLIKELY(!filter_partition.value)) {
+                         prefix_extractor);
+  if (UNLIKELY(!filter_partition.GetValue())) {
    return true;
   }
-  auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor,
-                                                    kNotValid, no_io);
-  if (cached) {
-    return res;
-  }
-  if (LIKELY(filter_partition.IsSet())) {
-    filter_partition.Release(table_->rep_->table_options.block_cache.get());
-  } else {
-    delete filter_partition.value;
-  }
-  return res;
+  return filter_partition.GetValue()->PrefixMayMatch(prefix, prefix_extractor,
+                                                     kNotValid, no_io);
 }
 
 BlockHandle
PartitionedFilterBlockReader::GetFilterPartitionHandle( @@ -251,10 +231,10 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } -BlockBasedTable::CachableEntry +CachableEntry PartitionedFilterBlockReader::GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { + const bool no_io, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -267,9 +247,9 @@ PartitionedFilterBlockReader::GetFilterPartition( RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT); RecordTick(statistics(), BLOCK_CACHE_HIT); RecordTick(statistics(), BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(iter->second.cache_handle)); - *cached = true; - return iter->second; + block_cache->GetUsage(iter->second.GetCacheHandle())); + return {iter->second.GetValue(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } } return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, @@ -278,7 +258,8 @@ PartitionedFilterBlockReader::GetFilterPartition( } else { auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, is_a_filter_partition, prefix_extractor); - return {filter, nullptr}; + return {filter, nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */}; } } @@ -293,18 +274,10 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { // TODO(myabandeh): better estimation for filter_map_ size } -// Release the cached entry and decrement its ref count. -void ReleaseFilterCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - // TODO(myabandeh): merge this with the same function in IndexReader void PartitionedFilterBlockReader::CacheDependencies( bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; IndexBlockIter biter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( @@ -330,7 +303,6 @@ void PartitionedFilterBlockReader::CacheDependencies( // After prefetch, read the partitions one by one biter.SeekToFirst(); - Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { handle = biter.value(); const bool no_io = true; @@ -338,16 +310,10 @@ void PartitionedFilterBlockReader::CacheDependencies( auto filter = table_->GetFilter( prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, /* get_context */ nullptr, prefix_extractor); - if (LIKELY(filter.IsSet())) { + if (LIKELY(filter.IsCached())) { if (pin) { filter_map_[handle.offset()] = std::move(filter); - RegisterCleanup(&ReleaseFilterCachedEntry, block_cache, - filter.cache_handle); - } else { - block_cache->Release(filter.cache_handle); } - } else { - delete filter.value; } } } diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 5d55da54493..2563dd2bf35 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -15,6 +15,7 @@ #include "table/block.h" #include "table/block_based_table_reader.h" +#include "table/cachable_entry.h" #include "table/full_filter_block.h" #include "table/index_builder.h" #include "util/autovector.h" @@ -69,8 +70,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { BlockHandle 
last_encoded_handle_; }; -class PartitionedFilterBlockReader : public FilterBlockReader, - public Cleanable { +class PartitionedFilterBlockReader : public FilterBlockReader { public: explicit PartitionedFilterBlockReader( const SliceTransform* prefix_extractor, bool whole_key_filtering, @@ -93,10 +93,9 @@ class PartitionedFilterBlockReader : public FilterBlockReader, private: BlockHandle GetFilterPartitionHandle(const Slice& entry); - BlockBasedTable::CachableEntry GetFilterPartition( + CachableEntry GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, bool* cached, - const SliceTransform* prefix_extractor = nullptr); + const bool no_io, const SliceTransform* prefix_extractor = nullptr); virtual void CacheDependencies( bool bin, const SliceTransform* prefix_extractor) override; @@ -106,9 +105,7 @@ class PartitionedFilterBlockReader : public FilterBlockReader, const BlockBasedTable* table_; const bool index_key_includes_seq_; const bool index_value_is_full_; - std::unordered_map> - filter_map_; + std::unordered_map> filter_map_; }; } // namespace rocksdb diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index 8068f14d815..8afa530d71a 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -35,7 +35,8 @@ class MockedBlockBasedTable : public BlockBasedTable { auto obj = new FullFilterBlockReader( prefix_extractor, true, BlockContents(slice), rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr}; + return {obj, nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */}; } FilterBlockReader* ReadFilter( From 6a6aef25c1f20f5922e1478999fe0e7f59af1712 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Fri, 10 May 2019 12:36:40 -0700 Subject: [PATCH 024/572] Fix crash in BlockBasedTableIterator::Seek() (#5291) Summary: https://github.com/facebook/rocksdb/pull/5256 broke it: `block_iter_.user_key()` may not be valid even if `block_iter_points_to_real_block_` is true. E.g. if there was an IO error or Status::Incomplete. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5291 Differential Revision: D15273324 Pulled By: al13n321 fbshipit-source-id: 442e5b09f9884a58f92a6ac1ca93af719c219886 --- table/block_based_table_reader.cc | 2 +- table/table_test.cc | 38 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 1dc220ddec5..576117f0d35 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2325,7 +2325,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) { } bool need_seek_index = true; - if (block_iter_points_to_real_block_) { + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. 
    prev_index_value_ = index_iter_->value();

    // We can avoid an index seek if:
diff --git a/table/table_test.cc b/table/table_test.cc
index a62ce4255e3..7292ad7c32d 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1798,6 +1798,44 @@ TEST_P(BlockBasedTableTest, PartitionIndexTest) {
   }
 }
 
+TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) {
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  Options options;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+
+  TableConstructor c(BytewiseComparator());
+  AddInternalKey(&c, "pika");
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+           &kvmap);
+  ASSERT_EQ(1, keys.size());
+
+  auto reader = c.GetTableReader();
+  ReadOptions ropt;
+  ropt.read_tier = ReadTier::kBlockCacheTier;
+  std::unique_ptr<InternalIterator> iter(
+      reader->NewIterator(ropt, /* prefix_extractor */ nullptr));
+
+  auto ikey = [](Slice user_key) {
+    return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+  };
+
+  iter->Seek(ikey("pika"));
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+
+  // This used to crash at some point.
+  iter->Seek(ikey("pika"));
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+}
+
 // It's very hard to figure out the index block size of a block accurately.
 // To make sure we get the index size, we just make sure as key number
 // grows, the filter block size also grows.

From e62601654535cdf9af46c99455af8da969efde65 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Fri, 10 May 2019 17:53:41 -0700
Subject: [PATCH 025/572] Fix a race condition caused by unlocking db mutex (#5294)

Summary:
Previous code may call `~ColumnFamilyData` in `DBImpl::AtomicFlushMemTablesToOutputFiles` if the column family is dropped or `cfd->IsFlushPending() == false`. In `~ColumnFamilyData`, the db mutex is released briefly and re-acquired. This can cause a correctness issue. The reason is as follows.
Assume there are multiple bg flush threads. After bg_flush_thr1 releases the db mutex, bg_flush_thr2 can grab it and pop an element from the flush queue. This will cause bg_flush_thr2 to accidentally pick some memtables which should have been picked by bg_flush_thr1. To make matters worse, bg_flush_thr2 can clear the `flush_requested_` flag for the memtable list, causing a subsequent call to `MemTableList::IsFlushPending()` by bg_flush_thr1 to return false, which is wrong.
The fix is to delay `ColumnFamilyData::Unref` and `~ColumnFamilyData` for column families not selected for flush until `AtomicFlushMemTablesToOutputFiles` returns. Furthermore, a bg flush thread should not clear `MemTableList::flush_requested_` in `MemTableList::PickMemtablesToFlush` unless atomic flush is not used **or** the memtable list does not have unpicked memtables.
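For illustration, the fix follows the usual deferred-teardown pattern, sketched below with simplified stand-in types (`Entry`, `DrainQueue`), not the actual RocksDB classes: any cleanup that might release and re-acquire the mutex is postponed until the scan of the shared queue is complete.

```
#include <mutex>
#include <vector>

// Simplified stand-in: a refcounted entry whose teardown must not run
// while the shared queue is being traversed under the mutex.
struct Entry {
  int refs = 1;
  bool Unref() { return --refs == 0; }
};

void DrainQueue(std::mutex& mu, std::vector<Entry*>& queue) {
  std::vector<Entry*> deferred;
  {
    std::lock_guard<std::mutex> lock(mu);
    for (Entry* e : queue) {
      // Deleting e here would be unsafe if its destructor could release
      // and re-acquire mu (as ~ColumnFamilyData does): another thread
      // could pop from the queue in that window. So only collect it.
      deferred.push_back(e);
    }
    queue.clear();
  }
  // Safe point: no shared state is being traversed any more.
  for (Entry* e : deferred) {
    if (e->Unref()) {
      delete e;
    }
  }
}
```

The deferred list costs one extra vector, but it keeps the critical section free of re-entrant locking.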
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5294 Differential Revision: D15295297 Pulled By: riversand963 fbshipit-source-id: 03b101205ca22c242647cbf488bcf0ed80b2ecbd --- HISTORY.md | 3 +++ db/db_flush_test.cc | 31 +++++++++++++++++++++++++++++++ db/db_impl_compaction_flush.cc | 10 +++++++--- db/memtable_list.cc | 8 +++++++- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 99235a33d5c..23d8717f361 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,9 @@ * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases +### Bug Fixes +* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 09c461f8da4..c603f60b460 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -514,6 +514,37 @@ TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) { ASSERT_EQ("value", Get(0, "key")); } +TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { + bool atomic_flush = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.max_write_buffer_number = 4; + // Set min_write_buffer_number_to_merge to be greater than 1, so that + // a column family with one memtable in the imm will not cause IsFlushPending + // to return true when flush_requested_ is false. + options.min_write_buffer_number_to_merge = 2; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + ASSERT_OK(Put(0, "key01", "value01")); + // Since max_write_buffer_number is 4, the following flush won't cause write + // stall. 
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+  handles_[1] = nullptr;
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  delete handles_[0];
+  handles_.clear();
+}
+
 INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
                         testing::Bool());
diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc
index 1cdadf03942..3fbf24e49f8 100644
--- a/db/db_impl_compaction_flush.cc
+++ b/db/db_impl_compaction_flush.cc
@@ -2082,6 +2082,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
   autovector<BGFlushArg> bg_flush_args;
   std::vector<SuperVersionContext>& superversion_contexts =
       job_context->superversion_contexts;
+  autovector<ColumnFamilyData*> column_families_not_to_flush;
   while (!flush_queue_.empty()) {
     // This cfd is already referenced
     const FlushRequest& flush_req = PopFirstFromFlushQueue();
@@ -2092,9 +2093,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
       ColumnFamilyData* cfd = iter.first;
       if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
         // can't flush this CF, try next one
-        if (cfd->Unref()) {
-          delete cfd;
-        }
+        column_families_not_to_flush.push_back(cfd);
         continue;
       }
       superversion_contexts.emplace_back(SuperVersionContext(true));
@@ -2133,6 +2132,11 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
       }
     }
   }
+  for (auto cfd : column_families_not_to_flush) {
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
   return status;
 }
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 5abe59b3632..69beb77f965 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -277,8 +277,12 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id,
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
   const auto& memlist = current_->memlist_;
+  bool atomic_flush = false;
   for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
     MemTable* m = *it;
+    if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+      atomic_flush = true;
+    }
     if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) {
       break;
     }
@@ -292,7 +296,9 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id,
       ret->push_back(m);
     }
   }
-  flush_requested_ = false;  // start-flush request is complete
+  if (!atomic_flush || num_flush_not_started_ == 0) {
+    flush_requested_ = false;  // start-flush request is complete
+  }
 }
 
 void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,

From 92c60547fe1bc3254a18c2ff82e5398339cdb45b Mon Sep 17 00:00:00 2001
From: Yi Wu
Date: Mon, 13 May 2019 11:26:34 -0700
Subject: [PATCH 026/572] db_bench: fix hang on IO error (#5300)

Summary:
db_bench will wait indefinitely if there's a background error. Fix by passing `abs_time_us` to the cond var's timed wait.
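The idea, sketched with the standard library rather than RocksDB's InstrumentedCondVar (`recovery_complete` and the deadline parameter are illustrative):

```
#include <chrono>
#include <condition_variable>
#include <mutex>

std::mutex mu;
std::condition_variable cv;
bool recovery_complete = false;

// A timed wait bounds how long the caller can block: if the recovery
// signal never arrives (e.g. the background error is unrecoverable),
// the deadline fires instead of hanging forever like a plain wait().
bool WaitForRecovery(std::chrono::steady_clock::time_point deadline) {
  std::unique_lock<std::mutex> lock(mu);
  cv.wait_until(lock, deadline, [] { return recovery_complete; });
  if (recovery_complete) {
    recovery_complete = false;  // consume the signal, as db_bench does
    return true;
  }
  return false;
}
```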
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5300

Differential Revision: D15319945

Pulled By: miasantreble

fbshipit-source-id: 0034fb7f6ec7c3303c4ccf26e54c20fbdac8ab44
---
 tools/db_bench_tool.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index f0f1d879b96..b2562f4e539 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -2102,10 +2102,10 @@ class Benchmark {
     cv_.SignalAll();
   }
 
-  bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+  bool WaitForRecovery(uint64_t abs_time_us) {
     InstrumentedMutexLock l(&mutex_);
     if (!recovery_complete_) {
-      cv_.Wait(/*abs_time_us*/);
+      cv_.TimedWait(abs_time_us);
     }
     if (recovery_complete_) {
       recovery_complete_ = false;

From f383641a1d772bcde6dc42f26d798c0d93311443 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 13 May 2019 17:43:47 -0700
Subject: [PATCH 027/572] Unordered Writes (#5218)

Summary:
Perform unordered writes in rocksdb when the unordered_write option is set to true. When enabled, writes to the memtable are done without joining any write thread. This offers much higher write throughput since the upcoming writes do not have to wait for the slowest memtable write to finish. The tradeoff is that the writes visible to a snapshot might change over time. If the application cannot tolerate that, it should implement its own mechanisms to work around it. Using TransactionDB with the WRITE_PREPARED write policy is one way to achieve that. Doing so increases the max throughput by 2.2x without however compromising the snapshot guarantees.

The patch is prepared based on an original by siying. Existing unit tests are extended to include the unordered_write option.

Benchmark Results:
```
TEST_TMPDIR=/dev/shm/ ./db_bench_unordered --benchmarks=fillrandom --threads=32 --num=10000000 -max_write_buffer_number=16 --max_background_jobs=64 --batch_size=8 --writes=3000000 -level0_file_num_compaction_trigger=99999 --level0_slowdown_writes_trigger=99999 --level0_stop_writes_trigger=99999 -enable_pipelined_write=false -disable_auto_compactions --unordered_write=1
```
With WAL
- Vanilla RocksDB: 78.6 MB/s
- WRITE_PREPARED with unordered_write: 177.8 MB/s (2.2x)
- unordered_write: 368.9 MB/s (4.7x with relaxed snapshot guarantees)

Without WAL
- Vanilla RocksDB: 111.3 MB/s
- WRITE_PREPARED with unordered_write: 259.3 MB/s (2.3x)
- unordered_write: 645.6 MB/s (5.8x with relaxed snapshot guarantees)
- WRITE_PREPARED with unordered_write and concurrency control disabled: 185.3 MB/s (2.35x)

Limitations:
- The feature is not yet extended to `max_successive_merges` > 0. The feature is also incompatible with `enable_pipelined_write` = true as well as with `allow_concurrent_memtable_write` = false.
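As a rough sketch of how an application could opt in while keeping snapshot guarantees (hypothetical usage built on the options this patch introduces, not code taken from the patch itself):

```
#include <string>
#include "rocksdb/utilities/transaction_db.h"

// Sketch: pair unordered_write with a WRITE_PREPARED TransactionDB so
// snapshots stay immutable despite out-of-order memtable writes.
rocksdb::Status OpenUnorderedTxnDB(const std::string& path,
                                   rocksdb::TransactionDB** txn_db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;  // the new option from this patch

  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;

  return rocksdb::TransactionDB::Open(options, txn_db_options, path, txn_db);
}
```

A plain DB::Open with unordered_write=true gets the full throughput benefit, but snapshots then only offer Read-Your-Own-Write semantics rather than a consistent point-in-time view.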
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5218 Differential Revision: D15219029 Pulled By: maysamyabandeh fbshipit-source-id: 38f2abc4af8780148c6128acdba2b3227bc81759 --- HISTORY.md | 1 + db/c.cc | 5 + db/db_bloom_filter_test.cc | 2 + db/db_impl.h | 58 ++++- db/db_impl_open.cc | 11 + db/db_impl_write.cc | 157 ++++++++++-- db/db_memtable_test.cc | 69 +++++ db/db_test_util.cc | 8 + db/db_test_util.h | 6 + db/flush_scheduler.h | 3 + db/plain_table_db_test.cc | 1 + db/write_batch.cc | 9 +- db/write_callback_test.cc | 17 +- include/rocksdb/c.h | 2 + include/rocksdb/options.h | 25 ++ options/db_options.cc | 3 + options/db_options.h | 1 + options/options_helper.cc | 4 + options/options_settable_test.cc | 1 + table/block_based_table_factory.cc | 8 +- tools/db_bench_tool.cc | 4 + .../pessimistic_transaction_db.cc | 16 +- utilities/transactions/transaction_test.cc | 61 +++-- utilities/transactions/transaction_test.h | 27 +- .../write_prepared_transaction_test.cc | 237 ++++++++++++------ .../write_unprepared_transaction_test.cc | 3 +- 26 files changed, 585 insertions(+), 154 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 23d8717f361..919dea21133 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. +* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput with however no compromise on guarantees. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. diff --git a/db/c.cc b/db/c.cc index 58b51e2523e..8f96366fbed 100644 --- a/db/c.cc +++ b/db/c.cc @@ -2473,6 +2473,11 @@ void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, opt->rep.enable_pipelined_write = v; } +void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.unordered_write = v; +} + void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, uint32_t n) { opt->rep.max_subcompactions = n; diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index a2a01d6b4cf..beed590ae66 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1095,6 +1095,8 @@ TEST_F(DBBloomFilterTest, PrefixScan) { options.max_background_compactions = 2; options.create_if_missing = true; options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false options.allow_concurrent_memtable_write = false; BlockBasedTableOptions table_options; diff --git a/db/db_impl.h b/db/db_impl.h index 623f69ba6ef..0ee5d82b56c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -897,14 +897,32 @@ class DBImpl : public DB { bool disable_memtable = false, uint64_t* seq_used = nullptr); - // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates - // the number of sub-patches. A sub-patch is a subset of the write batch that - // does not have duplicate keys. 
-  Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates,
-                          WriteCallback* callback = nullptr,
-                          uint64_t* log_used = nullptr, uint64_t log_ref = 0,
-                          uint64_t* seq_used = nullptr, size_t batch_cnt = 0,
-                          PreReleaseCallback* pre_release_callback = nullptr);
+  // Write only to memtables without joining any write queue
+  Status UnorderedWriteMemtable(const WriteOptions& write_options,
+                                WriteBatch* my_batch, WriteCallback* callback,
+                                uint64_t log_ref, SequenceNumber seq,
+                                const size_t sub_batch_cnt);
+
+  // Whether the batch requires an order to be assigned
+  enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+  // Whether it requires publishing the last sequence or not
+  enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+  // Join the write_thread to write the batch only to the WAL. It is the
+  // responsibility of the caller to also write the write batch to the memtable
+  // if required.
+  //
+  // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder,
+  // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+  // of the write batch that does not have duplicate keys. When seq_per_batch is
+  // not set, each key is a separate sub-batch. Otherwise each duplicate key
+  // marks the start of a new sub-batch.
+  Status WriteImplWALOnly(
+      WriteThread* write_thread, const WriteOptions& options,
+      WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+      const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+      PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+      const PublishLastSeq publish_last_seq, const bool disable_memtable);
 
   // write cached_recoverable_state_ to memtable if it is not empty
   // The writer must be the leader in write_thread_ and holding mutex_
@@ -1121,6 +1139,20 @@ class DBImpl : public DB {
       const autovector<const uint64_t*>& flush_memtable_ids,
       bool resuming_from_bg_err);
 
+  inline void WaitForPendingWrites() {
+    if (!immutable_db_options_.unordered_write) {
+      // Then the writes are finished before the next write group starts
+      return;
+    }
+    // Wait for the ones who already wrote to the WAL to finish their
+    // memtable write.
+    if (pending_memtable_writes_.load() != 0) {
+      std::unique_lock<std::mutex> guard(switch_mutex_);
+      switch_cv_.wait(guard,
+                      [&] { return pending_memtable_writes_.load() == 0; });
+    }
+  }
+
   // REQUIRES: mutex locked and in write thread.
   void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
 
@@ -1571,13 +1603,21 @@ class DBImpl : public DB {
   // corresponding call to PurgeObsoleteFiles has not yet finished.
   int pending_purge_obsolete_files_;
 
-  // last time when DeleteObsoleteFiles with full scan was executed. Originaly
+  // last time when DeleteObsoleteFiles with full scan was executed. Originally
   // initialized with startup time.
   uint64_t delete_obsolete_files_last_run_;
 
   // last time stats were dumped to LOG
   std::atomic<uint64_t> last_stats_dump_time_microsec_;
 
+  // The thread that wants to switch memtable can wait on this cv until the
+  // pending writes to memtable finish.
+  std::condition_variable switch_cv_;
+  // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+  std::mutex switch_mutex_;
+  // Number of threads intending to write to memtable
+  std::atomic<size_t> pending_memtable_writes_ = {};
+
   // Each flush or compaction gets its own job id.
this counter makes sure // they're unique std::atomic next_job_id_; diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 1bc69b49182..66104d0ba28 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -228,6 +228,17 @@ static Status ValidateOptions( return Status::InvalidArgument("keep_log_file_num must be greater than 0"); } + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with !allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + return Status::OK(); } } // namespace diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 3edec9ac521..733eb408a8d 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -94,6 +94,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with seq_per_batch"); } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } // Otherwise IsLatestPersistentState optimization does not make sense assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); @@ -107,8 +112,39 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (two_write_queues_ && disable_memtable) { - return WriteImplWALOnly(write_options, my_batch, callback, log_used, - log_ref, seq_used, batch_cnt, pre_release_callback); + AssignOrder assign_order = + seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? 
batch_cnt + // every key is a sub-batch consuming a seq + : WriteBatchInternal::Count(my_batch); + uint64_t seq; + // Use a write thread to i) optimize for WAL write, ii) publish last + // sequence in in increasing order, iii) call pre_release_callback serially + status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, + log_used, log_ref, &seq, sub_batch_cnt, + pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); + if (!status.ok()) { + return status; + } + if (seq_used) { + *seq_used = seq; + } + if (!disable_memtable) { + status = UnorderedWriteMemtable(write_options, my_batch, callback, + log_ref, seq, sub_batch_cnt); + } + return status; } if (immutable_db_options_.enable_pipelined_write) { @@ -534,23 +570,65 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, return w.FinalStatus(); } +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*concurrent_memtable_writes*/, seq_per_batch_, sub_batch_cnt); + + WriteStatusCheck(w.status); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt == 0) { + switch_cv_.notify_all(); + } + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + // The 2nd write queue. If enabled it will be used only for WAL-only writes. // This is the only queue that updates LastPublishedSequence which is only // applicable in a two-queue setting. 
-Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, - WriteBatch* my_batch, WriteCallback* callback, - uint64_t* log_used, uint64_t log_ref, - uint64_t* seq_used, size_t batch_cnt, - PreReleaseCallback* pre_release_callback) { +Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - true /* disable_memtable */, batch_cnt, - pre_release_callback); + disable_memtable, sub_batch_cnt, pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); - nonmem_write_thread_.JoinBatchGroup(&w); + write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); if (w.state == WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -563,9 +641,33 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. 
+ if (status.ok()) { + InstrumentedMutexLock l(&mutex_); + bool need_log_sync = false; + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + WriteStatusCheck(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + WriteThread::WriteGroup write_group; uint64_t last_sequence; - nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + write_thread->EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only @@ -602,11 +704,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL size_t seq_inc = 0 /* total_count */; - if (seq_per_batch_) { + if (assign_order == kDoAssignOrder) { size_t total_batch_cnt = 0; for (auto* writer : write_group) { - assert(writer->batch_cnt); - total_batch_cnt += writer->batch_cnt; + assert(writer->batch_cnt || !seq_per_batch_); + if (!writer->CallbackFailed()) { + total_batch_cnt += writer->batch_cnt; + } } seq_inc = total_batch_cnt; } @@ -617,16 +721,21 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // Otherwise we inc seq number to do solely the seq allocation last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } + + size_t memtable_write_cnt = 0; auto curr_seq = last_sequence + 1; for (auto* writer : write_group) { if (writer->CallbackFailed()) { continue; } writer->sequence = curr_seq; - if (seq_per_batch_) { - assert(writer->batch_cnt); + if (assign_order == kDoAssignOrder) { + assert(writer->batch_cnt || !seq_per_batch_); curr_seq += writer->batch_cnt; } + if (!writer->disable_memtable) { + memtable_write_cnt++; + } // else seq advances only by memtable writes } if (status.ok() && write_options.sync) { @@ -648,9 +757,8 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - const bool DISABLE_MEMTABLE = true; Status ws = writer->pre_release_callback->Callback( - writer->sequence, DISABLE_MEMTABLE, writer->log_used); + writer->sequence, disable_memtable, writer->log_used); if (!ws.ok()) { status = ws; break; @@ -658,7 +766,15 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } } } - nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status); + if (publish_last_seq == kDoPublishLastSeq) { + versions_->SetLastSequence(last_sequence + seq_inc); + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + } + if (immutable_db_options_.unordered_write && status.ok()) { + pending_memtable_writes_ += memtable_write_cnt; + } + write_thread->ExitAsBatchGroupLeader(write_group, status); if (status.ok()) { status = w.FinalStatus(); } @@ -710,6 +826,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); if (UNLIKELY(status.ok() && !single_column_family_mode_ && total_log_size_ > GetMaxTotalWalSize())) { + WaitForPendingWrites(); status = SwitchWAL(write_context); } @@ -719,10 +836,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // thread is writing to another DB with the same write buffer, they 
may also // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. + WaitForPendingWrites(); status = HandleWriteBufferFull(write_context); } if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + WaitForPendingWrites(); status = ScheduleFlushes(write_context); } diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 294d0f581bc..a212c981286 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -204,6 +204,75 @@ TEST_F(DBMemTableTest, DuplicateSeq) { delete mem; } +// A simple test to verify that the concurrent merge writes is functional +TEST_F(DBMemTableTest, ConcurrentMergeWrite) { + int num_ops = 1000; + std::string value; + Status s; + MergeContext merge_context; + Options options; + // A merge operator that is not sensitive to concurrent writes since in this + // test we don't order the writes. + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.allow_concurrent_memtable_write = true; + ImmutableCFOptions ioptions(options); + WriteBufferManager wb(options.db_write_buffer_size); + MemTablePostProcessInfo post_process_info; + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + + // Put 0 as the base + PutFixed64(&value, static_cast(0)); + bool res = mem->Add(0, kTypeValue, "key", value); + ASSERT_TRUE(res); + value.clear(); + + // Write Merge concurrently + rocksdb::port::Thread write_thread1([&]() { + std::string v1; + for (int seq = 1; seq < num_ops / 2; seq++) { + PutFixed64(&v1, seq); + bool res1 = + mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info); + ASSERT_TRUE(res1); + v1.clear(); + } + }); + rocksdb::port::Thread write_thread2([&]() { + std::string v2; + for (int seq = num_ops / 2; seq < num_ops; seq++) { + PutFixed64(&v2, seq); + bool res2 = + mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info); + ASSERT_TRUE(res2); + v2.clear(); + } + }); + write_thread1.join(); + write_thread2.join(); + + Status status; + ReadOptions roptions; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey("key", kMaxSequenceNumber); + res = mem->Get(lkey, &value, &status, &merge_context, + &max_covering_tombstone_seq, roptions); + ASSERT_TRUE(res); + uint64_t ivalue = DecodeFixed64(Slice(value).data()); + uint64_t sum = 0; + for (int seq = 0; seq < num_ops; seq++) { + sum += seq; + } + ASSERT_EQ(ivalue, sum); + + delete mem; +} + TEST_F(DBMemTableTest, InsertWithHint) { Options options; options.allow_concurrent_memtable_write = false; diff --git a/db/db_test_util.cc b/db/db_test_util.cc index bee6b81d5dd..ebfc7a9cad3 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -341,6 +341,7 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kPlainTableFirstBytePrefix: options.table_factory.reset(new PlainTableFactory()); @@ -373,12 +374,14 @@ Options DBTestBase::GetOptions( case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kHashLinkList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); 
options.memtable_factory.reset( NewHashLinkListRepFactory(4, 0, 3, true, 4)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kDirectIO: { options.use_direct_reads = true; @@ -540,6 +543,11 @@ Options DBTestBase::GetOptions( options.manual_wal_flush = true; break; } + case kUnorderedWrite: { + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + } default: break; diff --git a/db/db_test_util.h b/db/db_test_util.h index 50109e0a406..f5d7fd1a75f 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -140,6 +140,11 @@ class SpecialMemTableRep : public MemTableRep { memtable_->Insert(handle); } + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + // Returns true iff an entry that compares equal to key is in the list. virtual bool Contains(const char* key) const override { return memtable_->Contains(key); @@ -688,6 +693,7 @@ class DBTestBase : public testing::Test { kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, kxxHash64Checksum, + kUnorderedWrite, // This must be the last line kEnd, }; diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index cd3575861a8..b5abec40569 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -28,6 +28,9 @@ class FlushScheduler { // Filters column families that have been dropped. ColumnFamilyData* TakeNextColumnFamily(); + // This can be called concurrently with ScheduleFlush but it would miss all + // the scheduled flushes after the last synchronization. This would result + // into less precise enforcement of memtable sizes but should not matter much. bool Empty(); void Clear(); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 2dd0cff0b41..8a08cf9fede 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -142,6 +142,7 @@ class PlainTableDBTest : public testing::Test, options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.allow_mmap_reads = mmap_mode_; options.allow_concurrent_memtable_write = false; + options.unordered_write = false; return options; } diff --git a/db/write_batch.cc b/db/write_batch.cc index 939b595305b..830fbeab15d 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -1471,7 +1471,6 @@ class MemTableInserter : public WriteBatch::Handler { Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - assert(!concurrent_memtable_writes_); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); @@ -1498,6 +1497,8 @@ class MemTableInserter : public WriteBatch::Handler { MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); bool perform_merge = false; + assert(!concurrent_memtable_writes_ || + moptions->max_successive_merges == 0); // If we pass DB through and options.max_successive_merges is hit // during recovery, Get() will be issued which will try to acquire @@ -1505,6 +1506,7 @@ class MemTableInserter : public WriteBatch::Handler { // So we disable merge in recovery if (moptions->max_successive_merges > 0 && db_ != nullptr && recovering_log_number_ == 0) { + assert(!concurrent_memtable_writes_); LookupKey lkey(key, sequence_); // Count the number of successive merges at the head @@ -1550,6 +1552,7 @@ class MemTableInserter : public WriteBatch::Handler { perform_merge = false; } else { // 3) Add value to 
memtable + assert(!concurrent_memtable_writes_); bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); @@ -1562,7 +1565,9 @@ class MemTableInserter : public WriteBatch::Handler { if (!perform_merge) { // Add merge operator to memtable - bool mem_res = mem->Add(sequence_, kTypeMerge, key, value); + bool mem_res = + mem->Add(sequence_, kTypeMerge, key, value, + concurrent_memtable_writes_, get_post_process_info(mem)); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index cb880560efc..7f2b20d892f 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -124,6 +124,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { {false, false, true, false, true}, }; + for (auto& unordered_write : {true, false}) { for (auto& seq_per_batch : {true, false}) { for (auto& two_queues : {true, false}) { for (auto& allow_parallel : {true, false}) { @@ -133,15 +134,22 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { for (auto& write_group : write_scenarios) { Options options; options.create_if_missing = true; + options.unordered_write = unordered_write; options.allow_concurrent_memtable_write = allow_parallel; options.enable_pipelined_write = enable_pipelined_write; options.two_write_queues = two_queues; + // Skip unsupported combinations if (options.enable_pipelined_write && seq_per_batch) { - // This combination is not supported continue; } if (options.enable_pipelined_write && options.two_write_queues) { - // This combination is not supported + continue; + } + if (options.unordered_write && + !options.allow_concurrent_memtable_write) { + continue; + } + if (options.unordered_write && options.enable_pipelined_write) { continue; } @@ -358,8 +366,9 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { } } } -} -} + } + } + } } TEST_F(WriteCallbackTest, WriteCallBackTest) { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index ed0709d22a1..5e75dd70964 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -845,6 +845,8 @@ rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( rocksdb_options_t*, uint32_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index a1071f62ec7..c8b4cc538d9 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -893,6 +893,31 @@ struct DBOptions { // Default: false bool enable_pipelined_write = false; + // Setting unordered_write to true trades higher write throughput with + // relaxing the immutability guarantee of snapshots. This violates the + // repeatability one expects from ::Get from a snapshot, as well as + // ::MultiGet and Iterator's consistent-point-in-time view property. + // If the application cannot tolerate the relaxed guarantees, it can implement + // its own mechanisms to work around that and yet benefit from the higher + // throughput. Using TransactionDB with WRITE_PREPARED write policy is one way + // to achieve immutable snapshots despite unordered_write. 
+  //
+  // By default, i.e., when it is false, rocksdb does not advance the sequence
+  // number for new snapshots unless all the writes with lower sequence numbers
+  // are already finished. This provides the immutability that we expect from
+  // snapshots. Moreover, since Iterator and MultiGet internally depend on
+  // snapshots, the snapshot immutability results in Iterator and MultiGet
+  // offering a consistent-point-in-time view. If set to true, although the
+  // Read-Your-Own-Write property is still provided, the snapshot immutability
+  // property is relaxed: the writes issued after the snapshot is obtained (with
+  // larger sequence numbers) will still not be visible to the reads from that
+  // snapshot; however, there still might be pending writes (with lower sequence
+  // numbers) that will change the state visible to the snapshot after they
+  // land in the memtable.
+  //
+  // Default: false
+  bool unordered_write = false;
+
   // If true, allow multi-writers to update mem tables in parallel.
   // Only some memtable_factory-s support concurrent writes; currently it
   // is implemented only for SkipListFactory. Concurrent memtable writes
diff --git a/options/db_options.cc b/options/db_options.cc
index 83f1a18b042..e180238f433 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -67,6 +67,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       listeners(options.listeners),
       enable_thread_tracking(options.enable_thread_tracking),
       enable_pipelined_write(options.enable_pipelined_write),
+      unordered_write(options.unordered_write),
       allow_concurrent_memtable_write(options.allow_concurrent_memtable_write),
       enable_write_thread_adaptive_yield(
          options.enable_write_thread_adaptive_yield),
@@ -185,6 +186,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                    enable_thread_tracking);
   ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d",
                    enable_pipelined_write);
+  ROCKS_LOG_HEADER(log, " Options.unordered_write: %d",
+                   unordered_write);
   ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d",
                    allow_concurrent_memtable_write);
   ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d",
diff --git a/options/db_options.h b/options/db_options.h
index 8d02003623e..67b26786f5e 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -60,6 +60,7 @@ struct ImmutableDBOptions {
   std::vector<std::shared_ptr<EventListener>> listeners;
   bool enable_thread_tracking;
   bool enable_pipelined_write;
+  bool unordered_write;
   bool allow_concurrent_memtable_write;
   bool enable_write_thread_adaptive_yield;
   uint64_t write_thread_max_yield_usec;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index a973bbfde51..c33c2be6fb7 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -103,6 +103,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
   options.enable_thread_tracking = immutable_db_options.enable_thread_tracking;
   options.delayed_write_rate = mutable_db_options.delayed_write_rate;
   options.enable_pipelined_write = immutable_db_options.enable_pipelined_write;
+  options.unordered_write = immutable_db_options.unordered_write;
   options.allow_concurrent_memtable_write =
       immutable_db_options.allow_concurrent_memtable_write;
   options.enable_write_thread_adaptive_yield =
@@ -1583,6 +1584,9 @@ std::unordered_map<std::string, OptionTypeInfo>
         {"enable_pipelined_write",
          {offsetof(struct DBOptions, enable_pipelined_write),
           OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+        {"unordered_write",
+         {offsetof(struct DBOptions, unordered_write),
OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, {"allow_concurrent_memtable_write", {offsetof(struct DBOptions, allow_concurrent_memtable_write), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2d6cc11c02e..79a4fa81475 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -279,6 +279,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "advise_random_on_open=true;" "fail_if_options_file_error=false;" "enable_pipelined_write=false;" + "unordered_write=false;" "allow_concurrent_memtable_write=true;" "wal_recovery_mode=kPointInTimeRecovery;" "enable_write_thread_adaptive_yield=true;" diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 47fe8e1b0e3..790a2c99ecc 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -227,7 +227,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( } Status BlockBasedTableFactory::SanitizeOptions( - const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { return Status::InvalidArgument( @@ -268,6 +268,12 @@ Status BlockBasedTableFactory::SanitizeOptions( "data_block_hash_table_util_ratio should be greater than 0 when " "data_block_index_type is set to kDataBlockBinaryAndHash"); } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } return Status::OK(); } diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b2562f4e539..b806fff8980 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -891,6 +891,9 @@ DEFINE_uint64(delayed_write_rate, 8388608u, DEFINE_bool(enable_pipelined_write, true, "Allow WAL and memtable writes to be pipelined"); +DEFINE_bool(unordered_write, false, + "Allow WAL and memtable writes to be pipelined"); + DEFINE_bool(allow_concurrent_memtable_write, true, "Allow multi-writers to update mem tables in parallel."); @@ -3552,6 +3555,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { options.enable_write_thread_adaptive_yield = FLAGS_enable_write_thread_adaptive_yield; options.enable_pipelined_write = FLAGS_enable_pipelined_write; + options.unordered_write = FLAGS_unordered_write; options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec; options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec; options.rate_limit_delay_max_milliseconds = diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 8eb21777a99..05973e83aea 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -221,9 +221,18 @@ Status TransactionDB::Open( std::vector* handles, TransactionDB** dbptr) { Status s; DB* db = nullptr; + if (txn_db_options.write_policy == WRITE_COMMITTED && + db_options.unordered_write) { + return Status::NotSupported( + "WRITE_COMMITTED is incompatible with unordered_writes"); + } + if (txn_db_options.write_policy == WRITE_UNPREPARED && + db_options.unordered_write) { + // TODO(lth): support it + return Status::NotSupported( + 
"WRITE_UNPREPARED is currently incompatible with unordered_writes"); + } - ROCKS_LOG_WARN(db_options.info_log, "Transaction write_policy is %" PRId32, - static_cast(txn_db_options.write_policy)); std::vector column_families_copy = column_families; std::vector compaction_enabled_cf_indices; DBOptions db_options_2pc = db_options; @@ -238,6 +247,9 @@ Status TransactionDB::Open( s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db, use_seq_per_batch, use_batch_per_txn); if (s.ok()) { + ROCKS_LOG_WARN(db->GetDBOptions().info_log, + "Transaction write_policy is %" PRId32, + static_cast(txn_db_options.write_policy)); s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles, dbptr); } diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 1a5bf2d6644..997a5abe2d8 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -42,40 +42,48 @@ namespace rocksdb { INSTANTIATE_TEST_CASE_P( DBAsBaseDB, TransactionTest, - ::testing::Values(std::make_tuple(false, false, WRITE_COMMITTED), - std::make_tuple(false, true, WRITE_COMMITTED), - std::make_tuple(false, false, WRITE_PREPARED), - std::make_tuple(false, true, WRITE_PREPARED), - std::make_tuple(false, false, WRITE_UNPREPARED), - std::make_tuple(false, true, WRITE_UNPREPARED))); + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); INSTANTIATE_TEST_CASE_P( DBAsBaseDB, TransactionStressTest, - ::testing::Values(std::make_tuple(false, false, WRITE_COMMITTED), - std::make_tuple(false, true, WRITE_COMMITTED), - std::make_tuple(false, false, WRITE_PREPARED), - std::make_tuple(false, true, WRITE_PREPARED), - std::make_tuple(false, false, WRITE_UNPREPARED), - std::make_tuple(false, true, WRITE_UNPREPARED))); + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); INSTANTIATE_TEST_CASE_P( StackableDBAsBaseDB, TransactionTest, - ::testing::Values(std::make_tuple(true, true, WRITE_COMMITTED), - std::make_tuple(true, true, WRITE_PREPARED), - std::make_tuple(true, true, WRITE_UNPREPARED))); + ::testing::Values( + std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite), + std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite))); // MySQLStyleTransactionTest takes far too long for valgrind to run. 
#ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( MySQLStyleTransactionTest, MySQLStyleTransactionTest, - ::testing::Values(std::make_tuple(false, false, WRITE_COMMITTED, false), - std::make_tuple(false, true, WRITE_COMMITTED, false), - std::make_tuple(false, false, WRITE_PREPARED, false), - std::make_tuple(false, false, WRITE_PREPARED, true), - std::make_tuple(false, true, WRITE_PREPARED, false), - std::make_tuple(false, true, WRITE_PREPARED, true), - std::make_tuple(false, false, WRITE_UNPREPARED, false), - std::make_tuple(false, false, WRITE_UNPREPARED, true), - std::make_tuple(false, true, WRITE_UNPREPARED, false), - std::make_tuple(false, true, WRITE_UNPREPARED, true))); + ::testing::Values( + std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, DoubleEmptyWrite) { @@ -5646,7 +5654,7 @@ TEST_P(TransactionTest, DuplicateKeys) { } // do_rollback } // do_prepare - { + if (!options.unordered_write) { // Also test with max_successive_merges > 0. max_successive_merges will not // affect our algorithm for duplicate key insertion but we add the test to // verify that. 
@@ -5697,6 +5705,7 @@ TEST_P(TransactionTest, DuplicateKeys) { std::unique_ptr comp_gc(new ThreeBytewiseComparator()); cf_options.comparator = comp_gc.get(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); delete cf_handle; std::vector cfds{ diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 33b2c51ea2f..b4254870951 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -39,6 +39,8 @@ namespace rocksdb { // Return true if the ith bit is set in combination represented by comb bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); } +enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite }; + class TransactionTestBase : public ::testing::Test { public: TransactionDB* db; @@ -50,11 +52,13 @@ class TransactionTestBase : public ::testing::Test { bool use_stackable_db_; TransactionTestBase(bool use_stackable_db, bool two_write_queue, - TxnDBWritePolicy write_policy) + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) : db(nullptr), env(nullptr), use_stackable_db_(use_stackable_db) { options.create_if_missing = true; options.max_write_buffer_number = 2; options.write_buffer_size = 4 * 1024; + options.unordered_write = write_ordering == kUnorderedWrite; options.level0_file_num_compaction_trigger = 2; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); env = new FaultInjectionTestEnv(Env::Default()); @@ -352,6 +356,9 @@ class TransactionTestBase : public ::testing::Test { Transaction* txn; txn_db_options.write_policy = from_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } ReOpen(); for (int i = 0; i < 1024; i++) { @@ -400,6 +407,9 @@ class TransactionTestBase : public ::testing::Test { } // for i txn_db_options.write_policy = to_policy; + if (txn_db_options.write_policy == WRITE_COMMITTED) { + options.unordered_write = false; + } auto db_impl = reinterpret_cast(db->GetRootDB()); // Before upgrade/downgrade the WAL must be emptied if (empty_wal) { @@ -437,13 +447,14 @@ class TransactionTestBase : public ::testing::Test { } }; -class TransactionTest : public TransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { +class TransactionTest + : public TransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { public: TransactionTest() : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam())){}; + std::get<2>(GetParam()), std::get<3>(GetParam())){}; }; class TransactionStressTest : public TransactionTest {}; @@ -451,12 +462,12 @@ class TransactionStressTest : public TransactionTest {}; class MySQLStyleTransactionTest : public TransactionTestBase, virtual public ::testing::WithParamInterface< - std::tuple> { + std::tuple> { public: MySQLStyleTransactionTest() : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam())), - with_slow_threads_(std::get<3>(GetParam())) { + std::get<2>(GetParam()), std::get<3>(GetParam())), + with_slow_threads_(std::get<4>(GetParam())) { if (with_slow_threads_ && (txn_db_options.write_policy == WRITE_PREPARED || txn_db_options.write_policy == WRITE_UNPREPARED)) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index f2f3f30e26e..d5a03cd0408 100644 --- 
a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -342,8 +342,10 @@ class WritePreparedTxnDBMock : public WritePreparedTxnDB { class WritePreparedTransactionTestBase : public TransactionTestBase { public: WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue, - TxnDBWritePolicy write_policy) - : TransactionTestBase(use_stackable_db, two_write_queue, write_policy){}; + TxnDBWritePolicy write_policy, + WriteOrdering write_ordering) + : TransactionTestBase(use_stackable_db, two_write_queue, write_policy, + write_ordering){}; protected: void UpdateTransactionDBOptions(size_t snapshot_cache_bits, @@ -518,26 +520,26 @@ class WritePreparedTransactionTestBase : public TransactionTestBase { class WritePreparedTransactionTest : public WritePreparedTransactionTestBase, virtual public ::testing::WithParamInterface< - std::tuple> { + std::tuple> { public: WritePreparedTransactionTest() - : WritePreparedTransactionTestBase(std::get<0>(GetParam()), - std::get<1>(GetParam()), - std::get<2>(GetParam())){}; + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())){}; }; #ifndef ROCKSDB_VALGRIND_RUN class SnapshotConcurrentAccessTest : public WritePreparedTransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { + virtual public ::testing::WithParamInterface> { public: SnapshotConcurrentAccessTest() - : WritePreparedTransactionTestBase(std::get<0>(GetParam()), - std::get<1>(GetParam()), - std::get<2>(GetParam())), - split_id_(std::get<3>(GetParam())), - split_cnt_(std::get<4>(GetParam())){}; + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())){}; protected: // A test is split into split_cnt_ tests, each identified with split_id_ where @@ -549,15 +551,15 @@ class SnapshotConcurrentAccessTest class SeqAdvanceConcurrentTest : public WritePreparedTransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { + virtual public ::testing::WithParamInterface> { public: SeqAdvanceConcurrentTest() - : WritePreparedTransactionTestBase(std::get<0>(GetParam()), - std::get<1>(GetParam()), - std::get<2>(GetParam())), - split_id_(std::get<3>(GetParam())), - split_cnt_(std::get<4>(GetParam())){}; + : WritePreparedTransactionTestBase( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam())), + split_id_(std::get<4>(GetParam())), + split_cnt_(std::get<5>(GetParam())){}; protected: // A test is split into split_cnt_ tests, each identified with split_id_ where @@ -568,81 +570,152 @@ class SeqAdvanceConcurrentTest INSTANTIATE_TEST_CASE_P( WritePreparedTransactionTest, WritePreparedTransactionTest, - ::testing::Values(std::make_tuple(false, false, WRITE_PREPARED), - std::make_tuple(false, true, WRITE_PREPARED))); + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite))); #ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( TwoWriteQueues, SnapshotConcurrentAccessTest, - ::testing::Values(std::make_tuple(false, true, WRITE_PREPARED, 0, 20), - std::make_tuple(false, true, 
WRITE_PREPARED, 1, 20), - std::make_tuple(false, true, WRITE_PREPARED, 2, 20), - std::make_tuple(false, true, WRITE_PREPARED, 3, 20), - std::make_tuple(false, true, WRITE_PREPARED, 4, 20), - std::make_tuple(false, true, WRITE_PREPARED, 5, 20), - std::make_tuple(false, true, WRITE_PREPARED, 6, 20), - std::make_tuple(false, true, WRITE_PREPARED, 7, 20), - std::make_tuple(false, true, WRITE_PREPARED, 8, 20), - std::make_tuple(false, true, WRITE_PREPARED, 9, 20), - std::make_tuple(false, true, WRITE_PREPARED, 10, 20), - std::make_tuple(false, true, WRITE_PREPARED, 11, 20), - std::make_tuple(false, true, WRITE_PREPARED, 12, 20), - std::make_tuple(false, true, WRITE_PREPARED, 13, 20), - std::make_tuple(false, true, WRITE_PREPARED, 14, 20), - std::make_tuple(false, true, WRITE_PREPARED, 15, 20), - std::make_tuple(false, true, WRITE_PREPARED, 16, 20), - std::make_tuple(false, true, WRITE_PREPARED, 17, 20), - std::make_tuple(false, true, WRITE_PREPARED, 18, 20), - std::make_tuple(false, true, WRITE_PREPARED, 19, 20))); + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20), + + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20), + 
std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20))); INSTANTIATE_TEST_CASE_P( OneWriteQueue, SnapshotConcurrentAccessTest, - ::testing::Values(std::make_tuple(false, false, WRITE_PREPARED, 0, 20), - std::make_tuple(false, false, WRITE_PREPARED, 1, 20), - std::make_tuple(false, false, WRITE_PREPARED, 2, 20), - std::make_tuple(false, false, WRITE_PREPARED, 3, 20), - std::make_tuple(false, false, WRITE_PREPARED, 4, 20), - std::make_tuple(false, false, WRITE_PREPARED, 5, 20), - std::make_tuple(false, false, WRITE_PREPARED, 6, 20), - std::make_tuple(false, false, WRITE_PREPARED, 7, 20), - std::make_tuple(false, false, WRITE_PREPARED, 8, 20), - std::make_tuple(false, false, WRITE_PREPARED, 9, 20), - std::make_tuple(false, false, WRITE_PREPARED, 10, 20), - std::make_tuple(false, false, WRITE_PREPARED, 11, 20), - std::make_tuple(false, false, WRITE_PREPARED, 12, 20), - std::make_tuple(false, false, WRITE_PREPARED, 13, 20), - std::make_tuple(false, false, WRITE_PREPARED, 14, 20), - std::make_tuple(false, false, WRITE_PREPARED, 15, 20), - std::make_tuple(false, false, WRITE_PREPARED, 16, 20), - std::make_tuple(false, false, WRITE_PREPARED, 17, 20), - std::make_tuple(false, false, WRITE_PREPARED, 18, 20), - std::make_tuple(false, false, WRITE_PREPARED, 19, 20))); + ::testing::Values( + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20), + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20), + + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 20), + 
std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 10, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 11, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 12, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 13, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 14, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 15, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 16, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 17, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 18, 20), + std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 19, + 20))); INSTANTIATE_TEST_CASE_P( TwoWriteQueues, SeqAdvanceConcurrentTest, - ::testing::Values(std::make_tuple(false, true, WRITE_PREPARED, 0, 10), - std::make_tuple(false, true, WRITE_PREPARED, 1, 10), - std::make_tuple(false, true, WRITE_PREPARED, 2, 10), - std::make_tuple(false, true, WRITE_PREPARED, 3, 10), - std::make_tuple(false, true, WRITE_PREPARED, 4, 10), - std::make_tuple(false, true, WRITE_PREPARED, 5, 10), - std::make_tuple(false, true, WRITE_PREPARED, 6, 10), - std::make_tuple(false, true, WRITE_PREPARED, 7, 10), - std::make_tuple(false, true, WRITE_PREPARED, 8, 10), - std::make_tuple(false, true, WRITE_PREPARED, 9, 10))); + ::testing::Values( + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10))); INSTANTIATE_TEST_CASE_P( OneWriteQueue, SeqAdvanceConcurrentTest, - ::testing::Values(std::make_tuple(false, false, WRITE_PREPARED, 0, 10), - std::make_tuple(false, false, WRITE_PREPARED, 1, 10), - std::make_tuple(false, false, WRITE_PREPARED, 2, 10), - std::make_tuple(false, false, WRITE_PREPARED, 3, 10), - std::make_tuple(false, false, WRITE_PREPARED, 4, 10), - std::make_tuple(false, false, WRITE_PREPARED, 5, 10), - std::make_tuple(false, false, WRITE_PREPARED, 6, 10), - std::make_tuple(false, false, 
WRITE_PREPARED, 7, 10),
- std::make_tuple(false, false, WRITE_PREPARED, 8, 10),
- std::make_tuple(false, false, WRITE_PREPARED, 9, 10)));
+ ::testing::Values(
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10),
+
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 10),
+ std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 10)));
#endif // ROCKSDB_VALGRIND_RUN
TEST_P(WritePreparedTransactionTest, CommitMapTest) {
diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc
index 9aee33b078f..914f3f581e4 100644
--- a/utilities/transactions/write_unprepared_transaction_test.cc
+++ b/utilities/transactions/write_unprepared_transaction_test.cc
@@ -20,7 +20,8 @@ class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
WriteUnpreparedTransactionTestBase(bool use_stackable_db,
bool two_write_queue,
TxnDBWritePolicy write_policy)
- : TransactionTestBase(use_stackable_db, two_write_queue, write_policy){}
+ : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+ kOrderedWrite) {}
};
class WriteUnpreparedTransactionTest
From 6492430eaf1a13730eec81321528558cbf486c96 Mon Sep 17 00:00:00 2001
From: anand76
Date: Tue, 14 May 2019 11:54:52 -0700
Subject: [PATCH 028/572] Fix a bug in db_stress and an incorrect assertion in FilePickerMultiGet (#5301)

Summary:
This PR has two fixes for crash test failures -
1. Fix a bug in TestMultiGet() in db_stress that was passing the list of keys to MultiGet() in the wrong order, thus causing actual values to not match expected values
2. Remove an incorrect assertion in FilePickerMultiGet::GetNextFileInLevelWithKeys() that checks that files in a level are in sorted order. This is not true with MultiGet(), especially if there are duplicate keys and we may have to go back one file for the next key.
Furthermore, this assertion makes more sense when a new version is created, rather than at lookup time Test - asan_crash and ubsan_crash tests Pull Request resolved: https://github.com/facebook/rocksdb/pull/5301 Differential Revision: D15337383 Pulled By: anand1976 fbshipit-source-id: 35092cb15bbc1700e5e823cbe07bfa62f1e9e6c6 --- db/version_set.cc | 41 ++--------------------------------------- tools/db_stress.cc | 28 ++++++++++++++++------------ 2 files changed, 18 insertions(+), 51 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 84302556e66..f0dfe765871 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -353,7 +353,7 @@ class FilePickerMultiGet { struct FilePickerContext; public: - FilePickerMultiGet(std::vector* files, MultiGetRange* range, + FilePickerMultiGet(MultiGetRange* range, autovector* file_levels, unsigned int num_levels, FileIndexer* file_indexer, const Comparator* user_comparator, @@ -368,18 +368,12 @@ class FilePickerMultiGet { maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), -#ifndef NDEBUG - files_(files), -#endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), curr_file_level_(nullptr), file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { -#ifdef NDEBUG - (void)files; -#endif for (auto iter = range_->begin(); iter != range_->end(); ++iter) { fp_ctx_array_[iter.index()] = FilePickerContext(0, FileIndexer::kLevelMaxIndex); @@ -485,25 +479,6 @@ class FilePickerMultiGet { } else { file_hit = true; } -#ifndef NDEBUG - // Sanity check to make sure that the files are correctly sorted - if (f != prev_file_) { - if (prev_file_) { - if (curr_level_ != 0) { - int comp_sign = internal_comparator_->Compare( - prev_file_->largest_key, f->smallest_key); - assert(comp_sign < 0); - } else if (fp_ctx.curr_index_in_curr_level > 0) { - // level == 0, the current file cannot be newer than the previous - // one. Use compressed data structure, has no attribute seqNo - assert(!NewestFirstBySeqNo( - files_[0][fp_ctx.curr_index_in_curr_level], - files_[0][fp_ctx.curr_index_in_curr_level - 1])); - } - } - prev_file_ = f; - } -#endif if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this // file, so stop looking further. Also don't increment megt_iter_ @@ -635,9 +610,6 @@ class FilePickerMultiGet { bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; -#ifndef NDEBUG - std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -645,9 +617,6 @@ class FilePickerMultiGet { FileIndexer* file_indexer_; const Comparator* user_comparator_; const InternalKeyComparator* internal_comparator_; -#ifndef NDEBUG - FdWithKeyRange* prev_file_; -#endif // Setup local variables to search next level. // Returns false if there are no more levels to search. 
@@ -656,9 +625,6 @@ class FilePickerMultiGet { MultiGetRange::Iterator mget_iter = current_level_range_.begin(); if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -754,9 +720,6 @@ class FilePickerMultiGet { fp_ctx.curr_index_in_curr_level = start_index; } if (level_contains_keys) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -1800,7 +1763,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, MultiGetRange file_picker_range(*range, range->begin(), range->end()); FilePickerMultiGet fp( - storage_info_.files_, &file_picker_range, + &file_picker_range, &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index c6959802be3..6eb974e0934 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -3609,36 +3609,40 @@ class BatchedOpsStressTest : public StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) { size_t num_keys = rand_keys.size(); - std::vector statuses(num_keys); - std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; - for (int key = 0; key < 10; ++key) { + std::vector ret_status(num_keys); + std::array keys = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + size_t num_prefixes = keys.size(); + for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) { std::vector key_slices; - std::vector values(num_keys); + std::vector values(num_prefixes); + std::vector statuses(num_prefixes); ReadOptions readoptionscopy = readoptions; readoptionscopy.snapshot = db_->GetSnapshot(); std::vector key_str; - key_str.reserve(num_keys); - key_slices.reserve(num_keys); + key_str.reserve(num_prefixes); + key_slices.reserve(num_prefixes); std::string from_db; ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; - for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) { + for (size_t key = 0; key < num_prefixes; ++key) { key_str.emplace_back(keys[key] + Key(rand_keys[rand_key])); key_slices.emplace_back(key_str.back()); } - db_->MultiGet(readoptionscopy, cfh, num_keys, key_slices.data(), + db_->MultiGet(readoptionscopy, cfh, num_prefixes, key_slices.data(), values.data(), statuses.data()); - for (size_t i = 0; i < num_keys; i++) { + for (size_t i = 0; i < num_prefixes; i++) { Status s = statuses[i]; if (!s.ok() && !s.IsNotFound()) { fprintf(stderr, "get error: %s\n", s.ToString().c_str()); thread->stats.AddErrors(1); + ret_status[rand_key] = s; // we continue after error rather than exiting so that we can // find more errors if any } else if (s.IsNotFound()) { thread->stats.AddGets(1, 0); + ret_status[rand_key] = s; } else { - char expected_prefix = (keys[key])[0]; + char expected_prefix = (keys[i])[0]; char actual_prefix = (values[i])[0]; if (actual_prefix != expected_prefix) { fprintf(stderr, "error expected prefix = %c actual = %c\n", @@ -3655,7 +3659,7 @@ class BatchedOpsStressTest : public StressTest { db_->ReleaseSnapshot(readoptionscopy.snapshot); // Now that we retrieved all values, check that they all match - for (size_t i = 1; i < num_keys; i++) { + for (size_t i = 1; i < num_prefixes; i++) { if 
(values[i] != values[0]) {
fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
key_str[i].c_str(),
@@ -3667,7 +3671,7 @@
}
}
- return statuses;
+ return ret_status;
}

// Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
From 3c3252a06a77c5f6877392b882014dc8c8b2bd8f Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Wed, 15 May 2019 11:18:34 -0700
Subject: [PATCH 029/572] Fix tsan complaint in ConcurrentMergeWrite test (#5308)

Summary:
The test was not using a separate MemTablePostProcessInfo per memtable insert thread, and thus tsan was complaining about a data race.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5308

Differential Revision: D15356420

Pulled By: maysamyabandeh

fbshipit-source-id: 46c2f2d19fb02c3c775b587aa09ca9c0dae6ed04
---
 db/db_memtable_test.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index a212c981286..184c6f53b11 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -222,7 +222,6 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
options.allow_concurrent_memtable_write = true;
ImmutableCFOptions ioptions(options);
WriteBufferManager wb(options.db_write_buffer_size);
- MemTablePostProcessInfo post_process_info;
MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
kMaxSequenceNumber, 0 /* column_family_id */);
@@ -234,21 +233,23 @@
// Write Merge concurrently
rocksdb::port::Thread write_thread1([&]() {
+ MemTablePostProcessInfo post_process_info1;
std::string v1;
for (int seq = 1; seq < num_ops / 2; seq++) {
PutFixed64(&v1, seq);
bool res1 =
- mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info);
+ mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1);
ASSERT_TRUE(res1);
v1.clear();
}
});
rocksdb::port::Thread write_thread2([&]() {
+ MemTablePostProcessInfo post_process_info2;
std::string v2;
for (int seq = num_ops / 2; seq < num_ops; seq++) {
PutFixed64(&v2, seq);
bool res2 =
- mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info);
+ mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2);
ASSERT_TRUE(res2);
v2.clear();
}
From 189e711b3740ae3fbe2eeb8cd5a12419346bd627 Mon Sep 17 00:00:00 2001
From: Andres Suarez
Date: Wed, 15 May 2019 11:28:39 -0700
Subject: [PATCH 030/572] Text lint all .gitignore files

Reviewed By: scottrice, pallotron

Differential Revision: D15353820

fbshipit-source-id: 74f9eaadc90363a958692259f5cb66cef91ac8ef
---
 docs/.gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/.gitignore b/docs/.gitignore
index e48dc98be89..3938549cbe6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -6,4 +6,3 @@ _site
.sass-cache
*.psd
*~
-
From a42757607d5c7bc503958fd8027a4f9ef5cfceaf Mon Sep 17 00:00:00 2001
From: Thomas Fersch
Date: Wed, 15 May 2019 13:14:18 -0700
Subject: [PATCH 031/572] Use pre-increment instead of post-increment for iterators (#5296)

Summary:
Google C++ style guide indicates pre-increment should be used for iterators: https://google.github.io/styleguide/cppguide.html#Preincrement_and_Predecrement. Replaced all instances of ' it++' with ' ++it' (where the type is an iterator). This covers the cases where the iterator is named 'it'.
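(Illustrative aside, not part of patch 031/572: for a non-trivial iterator, post-increment must construct and return a copy of the iterator's old value, while pre-increment advances in place and returns a reference; when the returned value is discarded, the copy is pure overhead. The toy program below is hypothetical standard C++, not RocksDB code.)

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
      std::map<int, std::string> m = {{1, "one"}, {2, "two"}, {3, "three"}};
      // it++ would materialize a temporary iterator (a copy of the
      // pre-advance position) purely to discard it; ++it avoids that copy.
      for (auto it = m.begin(); it != m.end(); ++it) {
        std::printf("%d -> %s\n", it->first, it->second.c_str());
      }
      return 0;
    }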
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5296 Differential Revision: D15301256 Pulled By: tfersch fbshipit-source-id: 2803483c1392504ad3b281d21db615429c71114b --- db/compaction_job.cc | 2 +- db/db_impl.h | 2 +- db/db_impl_compaction_flush.cc | 10 +++++----- db/memtable_list.cc | 2 +- db/prefix_test.cc | 2 +- utilities/transactions/pessimistic_transaction_db.cc | 4 ++-- utilities/transactions/transaction_test.cc | 4 ++-- .../transactions/write_prepared_transaction_test.cc | 6 +++--- utilities/transactions/write_prepared_txn_db.cc | 4 ++-- utilities/transactions/write_unprepared_txn_db.cc | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 00386a99ad4..fb77431fddc 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -519,7 +519,7 @@ void CompactionJob::GenSubcompactionBoundaries() { auto* v = compact_->compaction->input_version(); for (auto it = bounds.begin();;) { const Slice a = *it; - it++; + ++it; if (it == bounds.end()) { break; diff --git a/db/db_impl.h b/db/db_impl.h index 0ee5d82b56c..c4fae9a6ad5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -725,7 +725,7 @@ class DBImpl : public DB { void DeleteAllRecoveredTransactions() { for (auto it = recovered_transactions_.begin(); - it != recovered_transactions_.end(); it++) { + it != recovered_transactions_.end(); ++it) { delete it->second; } recovered_transactions_.clear(); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 3fbf24e49f8..900ea4acdcd 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -2794,7 +2794,7 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { it = manual_compaction_dequeue_.erase(it); return; } - it++; + ++it; } assert(false); return; @@ -2815,7 +2815,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { bool seen = false; while (it != manual_compaction_dequeue_.end()) { if (m == (*it)) { - it++; + ++it; seen = true; continue; } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { @@ -2824,7 +2824,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { // and (*it) is ahead in the queue and is not yet in progress return true; } - it++; + ++it; } return false; } @@ -2842,7 +2842,7 @@ bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { // in progress return true; } - it++; + ++it; } return false; } @@ -2855,7 +2855,7 @@ bool DBImpl::HasExclusiveManualCompaction() { if ((*it)->exclusive) { return true; } - it++; + ++it; } return false; } diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 69beb77f965..21b44b1798a 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -437,7 +437,7 @@ Status MemTableList::TryInstallMemtableFlushResults( ++mem_id; } } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; it++) { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { MemTable* m = *it; // commit failed. setup state so that we can flush again. 
ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
index ac854cb3dbd..be420ded183 100644
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@@ -751,7 +751,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) {
for (size_t k = 0; k < 9; k++) {
if (rnd.OneIn(2) || it == whole_map.begin()) {
iter->Next();
- it++;
+ ++it;
if (FLAGS_enable_print) {
std::cout << "Next >> ";
}
diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc
index 05973e83aea..ecf6d2ff387 100644
--- a/utilities/transactions/pessimistic_transaction_db.cc
+++ b/utilities/transactions/pessimistic_transaction_db.cc
@@ -121,7 +121,7 @@ Status PessimisticTransactionDB::Initialize(
assert(dbimpl != nullptr);
auto rtrxs = dbimpl->recovered_transactions();
- for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) {
+ for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) {
auto recovered_trx = it->second;
assert(recovered_trx);
assert(recovered_trx->batches_.size() == 1);
@@ -594,7 +594,7 @@ void PessimisticTransactionDB::GetAllPreparedTransactions(
assert(transv);
transv->clear();
std::lock_guard lock(name_map_mutex_);
- for (auto it = transactions_.begin(); it != transactions_.end(); it++) {
+ for (auto it = transactions_.begin(); it != transactions_.end(); ++it) {
if (it->second->GetState() == Transaction::PREPARED) {
transv->push_back(it->second);
}
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 997a5abe2d8..2433af82637 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -567,7 +567,7 @@ TEST_P(TransactionTest, DeadlockCycleShared) {
TransactionID leaf_id =
dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root;
- for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); it++) {
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
auto dl_node = *it;
ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id);
ASSERT_EQ(dl_node.m_cf_id, 0);
@@ -774,7 +774,7 @@ TEST_P(TransactionStressTest, DeadlockCycle) {
}
// Iterates backwards over path verifying decreasing txn_ids.
- for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); it++) {
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
auto dl_node = *it;
ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1);
ASSERT_EQ(dl_node.m_cf_id, 0);
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index d5a03cd0408..6bad81db0ee 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -1099,7 +1099,7 @@ TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccessTest) {
new_snapshots.push_back(snapshots[old_snapshots.size() + i]);
}
for (auto it = common_snapshots.begin(); it != common_snapshots.end();
- it++) {
+ ++it) {
auto snapshot = *it;
// Create a commit entry that is around the snapshot and thus should
// not be discarded
@@ -1166,12 +1166,12 @@ TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasicTest) {
// b.
delayed prepared should contain every txn <= max and prepared should // only contain txns > max auto it = initial_prepared.begin(); - for (; it != initial_prepared.end() && *it <= new_max; it++) { + for (; it != initial_prepared.end() && *it <= new_max; ++it) { ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it)); } ASSERT_TRUE(wp_db->delayed_prepared_.empty()); for (; it != initial_prepared.end() && !wp_db->prepared_txns_.empty(); - it++, wp_db->prepared_txns_.pop()) { + ++it, wp_db->prepared_txns_.pop()) { ASSERT_EQ(*it, wp_db->prepared_txns_.top()); } ASSERT_TRUE(it == initial_prepared.end()); diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 8a7883c0504..3b09cbbf7d6 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -798,7 +798,7 @@ void WritePreparedTxnDB::UpdateSnapshots( // afterwards. size_t i = 0; auto it = snapshots.begin(); - for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; it++, i++) { + for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) { snapshot_cache_[i].store(*it, std::memory_order_release); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i); @@ -812,7 +812,7 @@ void WritePreparedTxnDB::UpdateSnapshots( } #endif snapshots_.clear(); - for (; it != snapshots.end(); it++) { + for (; it != snapshots.end(); ++it) { // Insert them to a vector that is less efficient to access // concurrently snapshots_.push_back(*it); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 4fcbfbc37c5..a1aeedf2e15 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -46,7 +46,7 @@ Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( }; // Iterate starting with largest sequence number. - for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); it++) { + for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) { auto last_visible_txn = it->first - 1; const auto& batch = it->second.batch_; WriteBatch rollback_batch; From da7c89d79d7033a53f30f82da3630ba3a0a77b8d Mon Sep 17 00:00:00 2001 From: Yuqi Gu Date: Wed, 15 May 2019 13:24:36 -0700 Subject: [PATCH 032/572] RocksDB Cmake changes for Arm64 CRC32 Optimization (#5304) Summary: Add CMake build for RocksDB CRC32 Optimization on Arm64. 
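(Illustrative aside, not part of patch 032/572: building with -march=armv8-a+crc makes the compiler define __ARM_FEATURE_CRC32 and expose the CRC32 intrinsics from <arm_acle.h>, which is what a guarded source file such as util/crc32c_arm64.cc can build on. The standalone sketch below is hypothetical, not RocksDB code, and includes a fallback branch so it also compiles on non-ARM targets.)

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    #if defined(__ARM_FEATURE_CRC32)
    #include <arm_acle.h>
    // Byte-at-a-time CRC32C using the ARMv8 CRC extension; a tuned version
    // would also use the wider __crc32cw/__crc32cd forms.
    static uint32_t Crc32c(uint32_t crc, const uint8_t* data, size_t len) {
      for (size_t i = 0; i < len; ++i) {
        crc = __crc32cb(crc, data[i]);
      }
      return crc;
    }
    #endif

    int main() {
    #if defined(__ARM_FEATURE_CRC32)
      const uint8_t buf[] = {'r', 'o', 'c', 'k', 's'};
      std::printf("crc32c = %08x\n",
                  (unsigned)Crc32c(0xffffffffu, buf, sizeof(buf)));
    #else
      std::printf("built without the ARMv8 CRC32 extension\n");
    #endif
      return 0;
    }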
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5304

Differential Revision: D15355193

Pulled By: miasantreble

fbshipit-source-id: 8d750a444274fbde14e510f51290631a369026b8
---
 CMakeLists.txt | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4feee986c3..5bb0c089f2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,15 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
endif(HAS_ALTIVEC)
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+ CHECK_C_COMPILER_FLAG("-march=armv8-a+crc" HAS_ARMV8_CRC)
+ if(HAS_ARMV8_CRC)
+ message(STATUS " HAS_ARMV8_CRC yes")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc -Wno-unused-function")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc -Wno-unused-function")
+ endif(HAS_ARMV8_CRC)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+
option(PORTABLE "build a portable binary" OFF)
option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF)
if(PORTABLE)
@@ -213,7 +222,7 @@ else()
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
else()
- if(NOT HAVE_POWER8)
+ if(NOT HAVE_POWER8 AND NOT HAS_ARMV8_CRC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
endif()
@@ -694,6 +703,11 @@ if(HAVE_POWER8)
util/crc32c_ppc_asm.S)
endif(HAVE_POWER8)
+if(HAS_ARMV8_CRC)
+ list(APPEND SOURCES
+ util/crc32c_arm64.cc)
+endif(HAS_ARMV8_CRC)
+
if(WIN32)
list(APPEND SOURCES
port/win/io_win.cc
From ad27045d14871d7edbee606ec19108c89c974336 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 15 May 2019 13:36:01 -0700
Subject: [PATCH 033/572] Update HISTORY after cherrypicking a bug fix to 6.2 (#5309)

Summary:
After cherry-picking a bug fix to the 6.2.fb branch, update the HISTORY.md file to reflect this change.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5309

Differential Revision: D15358002

Pulled By: riversand963

fbshipit-source-id: 5a60510ec6dd444ce5ffaefc69b2e4c38914a921
---
 HISTORY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 919dea21133..9cf8a88da04 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -13,7 +13,6 @@
* Merging iterator to avoid child iterator reseek for some cases
### Bug Fixes
-* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag.
## 6.2.0 (4/30/2019)
### New Features
@@ -34,6 +33,7 @@
* Fix a race condition between WritePrepared::Get and ::Put with duplicate keys.
* Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor.
* Close a WAL file before another thread deletes it.
+* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag.
## 6.1.1 (4/9/2019)
### New Features
From f0e821619742a8e97521d035c7e527c21743530a Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Wed, 15 May 2019 13:48:59 -0700
Subject: [PATCH 034/572] WritePrepared: Fix deadlock in WriteRecoverableState (#5306)

Summary:
The recent improvement in https://github.com/facebook/rocksdb/pull/3661 could cause a deadlock: When writing recoverable state, we also commit its sequence number to the commit table, which could result in evicting an existing commit entry, which could result in advancing max_evicted_seq_, which would need to get snapshots from the database, which requires obtaining the db mutex. The patch releases db_mutex before calling the callback in WriteRecoverableState to avoid the potential deadlock. It also improves the stress tests to let the issue manifest in the tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5306

Differential Revision: D15341458

Pulled By: maysamyabandeh

fbshipit-source-id: 05dcbed7e21b789fd1e5fd5ee8eea08077162323
---
 db/compaction_iterator.cc | 14 --------------
 db/db_impl_write.cc | 12 +++++++++---
 util/transaction_test_util.cc | 6 ++++++
 utilities/transactions/transaction_test.cc | 3 +++
 utilities/transactions/transaction_test.h | 1 +
 5 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index bce0b82dbc7..ca55eef7123 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -493,20 +493,6 @@ void CompactionIterator::NextFromInput() {
// in this snapshot.
assert(last_sequence >= current_user_key_sequence_);
- // Note2: if last_snapshot < current_user_key_snapshot, it can only
- // mean last_snapshot is released between we process last value and
- // this value, and findEarliestVisibleSnapshot returns the next snapshot
- // as current_user_key_snapshot. In this case last value and current
- // value are both in current_user_key_snapshot currently.
- // Although last_snapshot is released we might still get a definitive
- // response when key sequence number changes, e.g., when seq is determined
- // too old and visible in all snapshots.
- assert(last_snapshot == current_user_key_snapshot_ ||
- (snapshot_checker_ != nullptr &&
- snapshot_checker_->CheckInSnapshot(current_user_key_sequence_,
- last_snapshot) !=
- SnapshotCheckerResult::kNotInSnapshot));
-
++iter_stats_.num_record_drop_hidden; // (A)
input_->Next();
} else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc
index 733eb408a8d..f4c72e298ee 100644
--- a/db/db_impl_write.cc
+++ b/db/db_impl_write.cc
@@ -214,9 +214,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
WriteThread::WriteGroup write_group;
bool in_parallel_group = false;
uint64_t last_sequence = kMaxSequenceNumber;
- if (!two_write_queues_) {
- last_sequence = versions_->LastSequence();
- }
mutex_.Lock();
@@ -231,6 +228,11 @@
PERF_TIMER_STOP(write_pre_and_post_process_time);
status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
PERF_TIMER_START(write_pre_and_post_process_time);
}
@@ -1113,8 +1115,12 @@ Status DBImpl::WriteRecoverableState() {
for (uint64_t sub_batch_seq = seq + 1;
sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
status = recoverable_state_pre_release_callback_->Callback(
sub_batch_seq, !DISABLE_MEMTABLE, no_log_num);
+ mutex_.Lock();
}
}
if (status.ok()) {
diff --git a/util/transaction_test_util.cc b/util/transaction_test_util.cc
index 30cff11e14d..bd2d6afdca0 100644
--- a/util/transaction_test_util.cc
+++ b/util/transaction_test_util.cc
@@ -205,6 +205,12 @@ bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn,
ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), s.ToString().c_str(), txn->GetName().c_str());
+ if (rand_->OneIn(20)) {
+ // This currently only tests the mechanics of writing commit time
+ // write batch so the exact values would not matter.
+ s = txn_->GetCommitTimeWriteBatch()->Put("cat", "dog");
+ assert(s.ok());
+ }
db->GetDBOptions().env->SleepForMicroseconds(
static_cast(cmt_delay_ms_ * 1000));
}
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 2433af82637..6c71b679d60 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -5096,6 +5096,9 @@ Status TransactionStressTestInserter(
WriteOptions write_options;
ReadOptions read_options;
TransactionOptions txn_options;
+ if (rand->OneIn(2)) {
+ txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+ }
// Inside the inserter we might also retake the snapshot. We do both since two
// separate functions are engaged for each.
txn_options.set_snapshot = rand->OneIn(2);
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index b4254870951..00fa6cf0364 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -477,6 +477,7 @@ class MySQLStyleTransactionTest
// structures.
txn_db_options.wp_snapshot_cache_bits = 1;
txn_db_options.wp_commit_cache_bits = 10;
+ options.write_buffer_size = 1024;
EXPECT_OK(ReOpen());
}
};
From 468ca611052eb207cfa6f312c90be1aff9de48ba Mon Sep 17 00:00:00 2001
From: Raphael Bost
Date: Wed, 15 May 2019 14:16:36 -0700
Subject: [PATCH 035/572] Break large file writes into 1GB chunks (#5213)

Summary:
This is a workaround for the issue described in #5169. It has been tested on a database with very large values, but no dedicated test has been added to the code base.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5213

Differential Revision: D15243116

Pulled By: siying

fbshipit-source-id: e0c226a6cd71a60924dcd7ce7af74abcb4054484
---
 env/io_posix.cc | 135 ++++++++++++++++++++++++++++--------------------
 1 file changed, 79 insertions(+), 56 deletions(-)

diff --git a/env/io_posix.cc b/env/io_posix.cc
index 0f86c3ff93f..0ced06ff262 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -37,7 +37,7 @@
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
-#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif
namespace rocksdb {
@@ -58,6 +58,57 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
namespace {
+// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
+// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
+// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
+// the writes aligned.
+
+bool PosixWrite(int fd, const char* buf, size_t nbyte) {
+ const size_t kLimit1Gb = 1UL << 30;
+
+ const char* src = buf;
+ size_t left = nbyte;
+
+ while (left != 0) {
+ size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+ ssize_t done = write(fd, src, bytes_to_write);
+ if (done < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ return false;
+ }
+ left -= done;
+ src += done;
+ }
+ return true;
+}
+
+bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
+ const size_t kLimit1Gb = 1UL << 30;
+
+ const char* src = buf;
+ size_t left = nbyte;
+
+ while (left != 0) {
+ size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+ ssize_t done = pwrite(fd, src, bytes_to_write, offset);
+ if (done < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ return false;
+ }
+ left -= done;
+ offset += done;
+ src += done;
+ }
+
+ return true;
+}
+
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
struct stat buf;
@@ -180,7 +231,7 @@ bool IsSectorAligned(const void* ptr, size_t sector_size) {
return uintptr_t(ptr) % sector_size == 0;
}
-}
+} // namespace
#endif
/*
@@ -752,9 +803,9 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
int alloc_status = 0;
if (allow_fallocate_) {
- alloc_status = fallocate(
- fd_, fallocate_with_keep_size_ ?
FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -801,19 +852,13 @@ Status PosixWritableFile::Append(const Slice& data) { assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = write(fd_, src, left); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While appending to file", filename_, errno); - } - left -= done; - src += done; + size_t nbytes = data.size(); + + if (!PosixWrite(fd_, src, nbytes)) { + return IOError("While appending to file", filename_, errno); } - filesize_ += data.size(); + + filesize_ += nbytes; return Status::OK(); } @@ -825,21 +870,12 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { } assert(offset <= std::numeric_limits::max()); const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, static_cast(offset)); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While pwrite to file at offset " + ToString(offset), - filename_, errno); - } - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError("While pwrite to file at offset " + ToString(offset), + filename_, errno); } - filesize_ = offset; + filesize_ = offset + nbytes; return Status::OK(); } @@ -891,8 +927,8 @@ Status PosixWritableFile::Close() { // If not, we should hack it with FALLOC_FL_PUNCH_HOLE if (result == 0 && (file_stats.st_size + file_stats.st_blksize - 1) / - file_stats.st_blksize != - file_stats.st_blocks / (file_stats.st_blksize / 512)) { + file_stats.st_blksize != + file_stats.st_blocks / (file_stats.st_blksize / 512)) { IOSTATS_TIMER_GUARD(allocate_nanos); if (allow_fallocate_) { fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, @@ -942,10 +978,10 @@ void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { } #else (void)hint; -#endif // ROCKSDB_VALGRIND_RUN +#endif // ROCKSDB_VALGRIND_RUN #else (void)hint; -#endif // OS_LINUX +#endif // OS_LINUX } Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { @@ -974,9 +1010,9 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { - alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - static_cast(offset), static_cast(len)); + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -1037,24 +1073,11 @@ PosixRandomRWFile::~PosixRandomRWFile() { Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, offset); - if (done < 0) { - // error while writing to file - if (errno == EINTR) { - // write was interrupted, try again. 
- continue; - } - return IOError( - "While write random read/write file at offset " + ToString(offset), - filename_, errno); - } - - // Wrote `done` bytes - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError( + "While write random read/write file at offset " + ToString(offset), + filename_, errno); } return Status::OK(); From 8149bb9d6ab8ef55a30e9906f0bca8e6e0a42bec Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Wed, 15 May 2019 14:19:04 -0700 Subject: [PATCH 036/572] Pass OptionTypeInfo maps by const& (#5295) Summary: In options_helper.cc various functions take a const unordered_map of string -> TypeInfo for options handling. These functions pass the (const) maps by value, resulting in unnecessary copies. Change them to pass by reference. This results in a noticeable reduction in the amount of time spent parsing options - in my case a set of unit tests using RocksDB which call SetOptions() to modify options sees a ~25% runtime reduction. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5295 Differential Revision: D15296334 Pulled By: riversand963 fbshipit-source-id: 4d4be3db635264943607911b296dda27fd7ce1a7 --- options/options_helper.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/options/options_helper.cc b/options/options_helper.cc index c33c2be6fb7..dbee1636d9f 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -255,7 +255,7 @@ const std::string kNameMergeOperator = "merge_operator"; template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& delimiter); namespace { @@ -350,7 +350,7 @@ bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str, template bool SerializeStruct( const T& options, std::string* value, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { std::string opt_str; Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";"); if (!s.ok()) { @@ -363,7 +363,7 @@ bool SerializeStruct( template bool ParseSingleStructOption( const std::string& opt_val_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { size_t end = opt_val_str.find('='); std::string key = opt_val_str.substr(0, end); std::string value = opt_val_str.substr(end + 1); @@ -380,7 +380,7 @@ bool ParseSingleStructOption( template bool ParseStructOptions( const std::string& opt_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { assert(!opt_str.empty()); size_t start = 0; @@ -1092,7 +1092,7 @@ Status ParseColumnFamilyOption(const std::string& name, template bool SerializeSingleStructOption( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& name, const std::string& delimiter) { auto iter = type_info.find(name); if (iter == type_info.end()) { @@ -1112,7 +1112,7 @@ bool SerializeSingleStructOption( template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& delimiter) { assert(opt_string); opt_string->clear(); From 1583cb402eb6f52adac0261cb3766b47aac3078e Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 15 May 2019 15:13:44 -0700 Subject: [PATCH 037/572] Fix a flaky test with 
test sync point (#5310) Summary: If DB is opened with `avoid_unnecessary_blocking_io` being true, then `~ColumnFamilyHandleImpl` enqueues a purge request and schedules a background thread to perform the deletion. Without test sync point, whether the SST file is purged or not at a later point in time is not deterministic. If the SST does not exist, it will cause an assertion failure. How to reproduce: ``` $git checkout 6492430eaf1a13730eec81321528558cbf486c96 $make -j20 deletefile_test $gtest-parallel --repeat 1000 --worker 16 ./deletefile_test --gtest_filter=DeleteFileTest.BackgroundPurgeCFDropTest ``` The test may fail a few times. With changes made in this PR, repeat the above commands, and the test should not fail. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5310 Differential Revision: D15361136 Pulled By: riversand963 fbshipit-source-id: c4308d5f8da83472c893bf7f8ceed347fbfa850f --- db/deletefile_test.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 3ae464c5842..54bab847927 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -305,6 +305,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { &sleeping_task_after, Env::Priority::HIGH); // If background purge is enabled, the file should still be there. CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1"); // Execute background purges. sleeping_task_after.WakeUp(); @@ -318,6 +319,13 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { do_test(false); } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::BackgroundPurgeCFDropTest:1", + "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + options_.avoid_unnecessary_blocking_io = true; ASSERT_OK(ReopenDB(false)); { @@ -326,6 +334,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { } CloseDB(); + SyncPoint::GetInstance()->DisableProcessing(); } // This test is to reproduce a bug that read invalid ReadOption in iterator From 29a198564d097411ca4bf08ae061c35e91a22502 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 15 May 2019 15:57:04 -0700 Subject: [PATCH 038/572] Fixes for build_detect_platform Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5255 Differential Revision: D15246532 Pulled By: riversand963 fbshipit-source-id: 96a21509666152788fa2f956e865a6bed7c8f474 --- build_tools/build_detect_platform | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 057f77ec531..7f454bcca08 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -397,6 +397,7 @@ EOF #include int main() { size_t res = malloc_usable_size(0); + (void)res; return 0; } EOF @@ -411,6 +412,7 @@ EOF #include int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; + (void)x; return 0; } EOF @@ -422,7 +424,7 @@ EOF if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> + #include int main() { void* frames[1]; backtrace_symbols(frames, backtrace(frames, 1)); @@ -480,6 +482,7 @@ EOF #include int main() { int cpuid = sched_getcpu(); + (void)cpuid; } EOF if [ "$?" 
= 0 ]; then @@ -515,7 +518,7 @@ fi if test "$USE_HDFS"; then if test -z "$JAVA_HOME"; then - echo "JAVA_HOME has to be set for HDFS usage." + echo "JAVA_HOME has to be set for HDFS usage." >&2 exit 1 fi HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS -I$HADOOP_HOME/include" @@ -553,12 +556,13 @@ $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < int main() { volatile uint32_t x = _mm_crc32_u32(0, 0); + (void)x; } EOF if [ "$?" = 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DHAVE_SSE42" elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" + echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 exit 1 fi @@ -570,12 +574,13 @@ $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <&2 exit 1 fi @@ -589,6 +594,7 @@ if [ "$PLATFORM" != IOS ]; then #endif int main() { static __thread int tls; + (void)tls; } EOF if [ "$?" = 0 ]; then From f82e693a31d07ab8b391888ff60eb7ff5b95bd13 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 16 May 2019 15:20:19 -0700 Subject: [PATCH 039/572] RangeDelAggregator::StripeRep::Invalidate() to be skipped if empty (#5312) Summary: RangeDelAggregator::StripeRep::Invalidate() clears up several vectors. If we know there isn't anything there, we can save this small amount of CPU. Profiling shows that it sometimes takes a non-negligible amount of CPU. Worth a small optimization. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5312 Differential Revision: D15380511 Pulled By: siying fbshipit-source-id: 53c5f34c33b4cb1e743643c6086ac56d0b84ec2e --- db/range_del_aggregator.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index e593807d548..ce7897a975a 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -320,8 +320,10 @@ class RangeDelAggregator { RangeDelPositioningMode mode); void Invalidate() { - InvalidateForwardIter(); - InvalidateReverseIter(); + if (!IsEmpty()) { + InvalidateForwardIter(); + InvalidateReverseIter(); + } } bool IsRangeOverlapped(const Slice& start, const Slice& end); From c71f5bb9aa7fd2f12533a5b8300949e7f766e213 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Thu, 16 May 2019 15:35:28 -0700 Subject: [PATCH 040/572] Disable WriteUnPrepared stress tests (#5315) Summary: They are kind of flaky at the moment. Will re-enable them when the flakiness is fixed. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5315 Differential Revision: D15382744 Pulled By: maysamyabandeh fbshipit-source-id: 8b2f9d81a4bb34bfd51481727a682d5cd063c5e3 --- utilities/transactions/transaction_test.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 6c71b679d60..6ea1fc70213 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -79,11 +79,7 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), - std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), - std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), - std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), - std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), - std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true))); + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, DoubleEmptyWrite) { From a13026fb2fa45a1cc0f03f5e426035088f394c0a Mon Sep 17 00:00:00 2001 From: Zhichao Cao Date: Thu, 16 May 2019 20:18:33 -0700 Subject: [PATCH 041/572] Added trace replay fast forward function (#5273) Summary: In the current db_bench trace replay, the replay process strictly follows the timestamps to issue the queries. In some cases, the user does not care about the time. Therefore, fast forward is needed for users to speed up the replay process. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5273 Differential Revision: D15389232 Pulled By: zhichao-cao fbshipit-source-id: 735d629b9d2a167b05af3e4fa0ddf9d5d0be1806 --- tools/db_bench_tool.cc | 5 +++++ util/trace_replay.cc | 15 ++++++++++++++- util/trace_replay.h | 2 ++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b806fff8980..18d8733439b 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -762,6 +762,9 @@ DEFINE_bool(use_stderr_info_logger, false, DEFINE_string(trace_file, "", "Trace workload to a file. "); +DEFINE_int32(trace_replay_fast_forward, 1, + "Fast forward trace replay, must >= 1. 
"); + static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -6163,6 +6166,8 @@ void VerifyDBFromDB(std::string& truth_db_name) { } Replayer replayer(db_with_cfh->db, db_with_cfh->cfh, std::move(trace_reader)); + replayer.SetFastForward( + static_cast(FLAGS_trace_replay_fast_forward)); s = replayer.Replay(); if (s.ok()) { fprintf(stdout, "Replay started from trace_file: %s\n", diff --git a/util/trace_replay.cc b/util/trace_replay.cc index 28160b29292..c90fef2eff8 100644 --- a/util/trace_replay.cc +++ b/util/trace_replay.cc @@ -155,10 +155,22 @@ Replayer::Replayer(DB* db, const std::vector& handles, for (ColumnFamilyHandle* cfh : handles) { cf_map_[cfh->GetID()] = cfh; } + fast_forward_ = 1; } Replayer::~Replayer() { trace_reader_.reset(); } +Status Replayer::SetFastForward(uint32_t fast_forward) { + Status s; + if (fast_forward < 1) { + s = Status::InvalidArgument("Wrong fast forward speed!"); + } else { + fast_forward_ = fast_forward; + s = Status::OK(); + } + return s; +} + Status Replayer::Replay() { Status s; Trace header; @@ -182,7 +194,8 @@ Status Replayer::Replay() { } std::this_thread::sleep_until( - replay_epoch + std::chrono::microseconds(trace.ts - header.ts)); + replay_epoch + + std::chrono::microseconds((trace.ts - header.ts) / fast_forward_)); if (trace.type == kTraceWrite) { WriteBatch batch(trace.payload); db_->Write(woptions, &batch); diff --git a/util/trace_replay.h b/util/trace_replay.h index 749ea2f6432..29c00c287b2 100644 --- a/util/trace_replay.h +++ b/util/trace_replay.h @@ -88,6 +88,7 @@ class Replayer { ~Replayer(); Status Replay(); + Status SetFastForward(uint32_t fast_forward); private: Status ReadHeader(Trace* header); @@ -97,6 +98,7 @@ class Replayer { DBImpl* db_; std::unique_ptr trace_reader_; std::unordered_map cf_map_; + uint32_t fast_forward_; }; } // namespace rocksdb From f3a7847598d89ef8f9f531b10fabb7ce044a38f8 Mon Sep 17 00:00:00 2001 From: yiwu-arbug Date: Fri, 17 May 2019 10:23:38 -0700 Subject: [PATCH 042/572] Reduce iterator key comparison for upper/lower bound check (#5111) Summary: Previously if iterator upper/lower bound presents, `DBIter` will check the bound for every key. This patch turns the check into per-file or per-data block check when applicable, by checking against either file largest/smallest key or block index key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5111 Differential Revision: D15330061 Pulled By: siying fbshipit-source-id: 8a653fe3cd50d94d81eb2d13b087326c58ee2024 --- HISTORY.md | 1 + db/db_iter.cc | 4 ++-- db/version_set.cc | 40 +++++++++++++++++++++++-------- table/block_based_table_reader.cc | 20 +++++++++------- table/block_based_table_reader.h | 9 ++++++- table/internal_iterator.h | 25 +++++++++++++++++-- table/iterator_wrapper.h | 22 +++++++++++++---- table/merging_iterator.cc | 24 +++++++++++++++++++ 8 files changed, 117 insertions(+), 28 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9cf8a88da04..d45e94bb670 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases +* Reduce iterator key comparision for upper/lower bound check. 
### Bug Fixes diff --git a/db/db_iter.cc b/db/db_iter.cc index 1d8ccf9adbd..a606e3acd66 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,7 +467,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); - if (iterate_upper_bound_ != nullptr && + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -859,7 +859,7 @@ void DBIter::PrevInternal() { return; } - if (iterate_lower_bound_ != nullptr && + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. diff --git a/db/version_set.cc b/db/version_set.cc index f0dfe765871..03c5902728c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -954,12 +969,16 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + may_be_out_of_lower_bound_ = + read_options_.iterate_lower_bound != nullptr && + user_comparator_.Compare(ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, + for_compaction_, nullptr /* arena */, skip_filters_, level_, + smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -975,6 +994,7 @@ class LevelIterator final : public InternalIterator { bool should_sample_; bool for_compaction_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1043,11 
+1063,12 @@ void LevelIterator::SeekToLast() { void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -4278,10 +4299,9 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 576117f0d35..34e40979247 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2446,11 +2446,12 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { + IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -2531,6 +2532,11 @@ void BlockBasedTableIterator::InitDataBlock() { key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; + if (read_options_.iterate_upper_bound != nullptr) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } } } @@ -2543,13 +2549,9 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - } + bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 385e50ab79f..8274f0cf965 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -588,7 +588,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && block_iter_points_to_real_block_ && @@ -619,6 +619,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. 
bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -680,6 +685,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { TBlockIter block_iter_; bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; + // Whether the current data block is fully within the iterate upper bound. + bool data_block_within_upper_bound_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8f1cc9dd68e..1f57399c7f7 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,6 +17,11 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: @@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and returns the result. Iterator + // implementations should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -94,6 +108,13 @@ class InternalIteratorBase : public Cleanable { // upper bound virtual bool IsOutOfBound() { return false; } + // Keys returned from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys returned from this iterator can be larger than or equal to + // iterate_upper_bound. 
+ virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a570e53c1e2..a5aa5c49eac 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,7 +56,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); } Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -83,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -100,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index e5df6bdf6f0..244b5e82c3d 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from the current child iterator. As long as the current child iterator + // cannot be out of bound, we know the current key is within bound. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { From fb4c6a31cece73f79a05135c2821d511cd76aeba Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 17 May 2019 19:16:51 -0700 Subject: [PATCH 043/572] Log replay integration for secondary instance (#5305) Summary: RocksDB secondary can replay both MANIFEST and WAL now. 
On the one hand, the memory usage by memtables will grow after replaying WAL for some time. On the other hand, replaying the MANIFEST can bring the database persistent data to a more recent point in time, giving us the opportunity to discard some memtables containing outdated data. This PR coordinates the MANIFEST and WAL replay, using the updates from MANIFEST replay to update the active memtable and immutable memtable list of each column family. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5305 Differential Revision: D15386512 Pulled By: riversand963 fbshipit-source-id: a3ea6fc415f8382d8cf624f52a71ebdcffa3e355 --- HISTORY.md | 1 + db/db_impl.h | 4 +- db/db_impl_secondary.cc | 119 +++++++++++++++++++++++++++--------- db/db_impl_secondary.h | 102 ++++++++++++++++++++++++++----- db/db_secondary_test.cc | 130 ++++++++++++++++++++++++++++++++++++++++ db/memtable_list.cc | 18 ++++++ db/memtable_list.h | 7 +++ 7 files changed, 336 insertions(+), 45 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d45e94bb670..f67a8210d24 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput with however no compromise on guarantees. +* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. diff --git a/db/db_impl.h b/db/db_impl.h index c4fae9a6ad5..08cb1949118 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -1078,8 +1078,8 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); // REQUIRES: log_numbers are sorted in ascending order - virtual Status RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* next_sequence, bool read_only); + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 007910ea5b4..5dfa2d0c942 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -18,7 +18,6 @@ namespace rocksdb { #ifndef ROCKSDB_LITE - DBImplSecondary::DBImplSecondary(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { @@ -35,6 +34,7 @@ Status DBImplSecondary::Recover( bool /*error_if_data_exists_in_logs*/) { mutex_.AssertHeld(); + JobContext job_context(0); Status s; s = static_cast(versions_.get()) ->Recover(column_families, &manifest_reader_, &manifest_reporter_, @@ -59,11 +59,29 @@ Status DBImplSecondary::Recover( single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; - s = FindAndRecoverLogFiles(); + std::unordered_set cfds_changed; + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); } // TODO: update options_file_number_ needed? 
+ job_context.Clean(); + return s; +} + +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context); + } return s; } @@ -151,7 +169,10 @@ Status DBImplSecondary::MaybeInitLogReader( // REQUIRES: log_numbers are sorted in ascending order Status DBImplSecondary::RecoverLogFiles( const std::vector& log_numbers, SequenceNumber* next_sequence, - bool /*read_only*/) { + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); mutex_.AssertHeld(); Status status; for (auto log_number : log_numbers) { @@ -184,6 +205,39 @@ Status DBImplSecondary::RecoverLogFiles( continue; } WriteBatchInternal::SetContents(&batch, record); + std::vector column_family_ids; + status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); + if (status.ok()) { + SequenceNumber seq = versions_->LastSequence(); + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + auto curr_log_num = port::kMaxUint64; + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. 
+ if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq); + cfd->mem()->SetNextLogNumber(log_number); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + } // do not check sequence number because user may toggle disableWAL // between writes which breaks sequence number continuity guarantee @@ -194,12 +248,30 @@ Status DBImplSecondary::RecoverLogFiles( // That's why we set ignore missing column families to true // passing null flush_scheduler will disable memtable flushing which is // needed for secondary instances - bool has_valid_writes = false; - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), nullptr /* flush_scheduler */, - true, log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); - if (!status.ok()) { + if (status.ok()) { + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); + } + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + } else { // We are treating this as a failure while reading since we read valid // blocks that do not form coherent data reader->GetReporter()->Corruption(record.size(), status); @@ -296,18 +368,6 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, return s; } -// find new WAL and apply them in order to the secondary instance -Status DBImplSecondary::FindAndRecoverLogFiles() { - Status s; - std::vector logs; - s = FindNewLogNumbers(&logs); - if (s.ok() && !logs.empty()) { - SequenceNumber next_sequence(kMaxSequenceNumber); - s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); - } - return s; -} - Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { if (read_options.managed) { @@ -393,20 +453,25 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { Status s; // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; + JobContext job_context(0, true /*create_superversion*/); InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + if (s.ok()) { + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } if (s.ok()) { - SuperVersionContext sv_context(true /* create_superversion */); for (auto cfd : cfds_changed) { - sv_context.NewSuperVersion(); + cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(), + &job_context.memtables_to_free); + auto& sv_context = job_context.superversion_contexts.back(); cfd->InstallSuperVersion(&sv_context, &mutex_); + sv_context.NewSuperVersion(); } - 
sv_context.Clean(); + job_context.Clean(); } - // list wal_dir to discover new WALs and apply new changes to the secondary - // instance - s = FindAndRecoverLogFiles(); return s; } diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 32dbae058b8..912708b1ec0 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -96,40 +96,40 @@ class DBImplSecondary : public DBImpl { Status Put(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Delete; Status Delete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::SingleDelete; Status SingleDelete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactRange; Status CompactRange(const CompactRangeOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, const Slice* /*end*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactFiles; @@ -140,32 +140,32 @@ class DBImplSecondary : public DBImpl { const int /*output_level*/, const int /*output_path_id*/ = -1, std::vector* const /*output_file_names*/ = nullptr, CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status DisableFileDeletions() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status EnableFileDeletions(bool /*force*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status GetLiveFiles(std::vector&, uint64_t* /*manifest_file_size*/, bool /*flush_memtable*/ = true) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Flush; Status Flush(const FlushOptions& /*options*/, ColumnFamilyHandle* /*column_family*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in 
secondary mode."); } using DBImpl::SyncWAL; Status SyncWAL() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DB::IngestExternalFile; @@ -173,7 +173,7 @@ class DBImplSecondary : public DBImpl { ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } // Try to catch up with the primary by reading as much as possible from the @@ -185,6 +185,70 @@ class DBImplSecondary : public DBImpl { Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); + protected: + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + private: friend class DB; @@ -194,19 +258,25 @@ class DBImplSecondary : public DBImpl { using DBImpl::Recover; - Status FindAndRecoverLogFiles(); + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); Status FindNewLogNumbers(std::vector* logs); Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, - bool read_only) override; + std::unordered_set* cfds_changed, + JobContext* job_context); std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; - // cache log readers for each log number, used for continue WAL replay + // Cache log readers for each log number, used for continue WAL replay // after recovery std::map> log_readers_; + + // Current WAL number replayed for each column family. 
+ std::unordered_map cfd_to_current_log_; }; } // namespace rocksdb diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 60ea5ba8d5f..a4267c7d596 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -243,6 +243,11 @@ TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { @@ -519,6 +524,131 @@ TEST_F(DBSecondaryTest, SwitchManifest) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); range_scan_db(); } + +TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + ASSERT_OK( + Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK( + Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + wb.Put("key0", "value0"); + wb.Put("key1", "value1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + 
ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + wb1.Put("key0", "value01"); + wb1.Put("key1", "value11"); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + wb2.Put("key0", "new_value0"); + wb2.Delete("key1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); +} #endif //! ROCKSDB_LITE } // namespace rocksdb diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 21b44b1798a..d81b1d4d224 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -638,4 +638,22 @@ Status InstallMemtableAtomicFlushResults( return s; } +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } +} + } // namespace rocksdb diff --git a/db/memtable_list.h b/db/memtable_list.h index b56ad4932c4..5df35660a4d 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -294,6 +294,13 @@ class MemTableList { } } + // Used only by DBImplSecondary during log replay. + // Remove memtables whose data were written before the WAL with log_number + // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are + // not freed, but put into a vector for future deref and reclamation. + void RemoveOldMemTables(uint64_t log_number, + autovector* to_delete); + private: friend Status InstallMemtableAtomicFlushResults( const autovector* imm_lists, From 5c0e304170dbb157f9faa612f0568f37ad506674 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 20 May 2019 07:46:15 -0700 Subject: [PATCH 044/572] WritePrepared: Clarify the need for two_write_queues in unordered_write (#5313) Summary: WritePrepared transactions, when configured with two_write_queues=true, offer higher throughput with the unordered_write feature without compromising the RocksDB guarantees. This is because they perform ordering among writes in a 2nd step that is not tied to memtable write speed. The 2nd step is naturally provided by 2PC when the commit phase does the ordering as well. Without 2PC, the 2nd step is only provided when we use two_write_queues=true, where WritePrepared, after performing the writes, uses the 2nd queue in a 2nd step to assign order to the writes. The patch clarifies the need for two_write_queues=true in HISTORY.md and in the inline comments of unordered_write. 
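For illustration only (not part of this patch's diff): a minimal sketch of the configuration this implies, assuming the public TransactionDB::Open API; the DB path and the error handling are placeholders.
```
// Sketch: unordered_write with WRITE_PREPARED requires two_write_queues=true,
// so that the 2nd (commit) queue can assign order to the otherwise unordered
// memtable writes.
#include <cassert>

#include "rocksdb/utilities/transaction_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;   // trade write ordering for throughput
  options.two_write_queues = true;  // required for WRITE_PREPARED below

  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;

  rocksdb::TransactionDB* db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/txn_db_example", &db);
  // With two_write_queues=false, this combination now fails with
  // Status::NotSupported (see the check added below).
  assert(s.ok());
  delete db;
  return 0;
}
```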
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5313 Differential Revision: D15379977 Pulled By: maysamyabandeh fbshipit-source-id: 5b6f05b9b59285dcbf3b0532215ba9fe7d926e00 --- HISTORY.md | 2 +- db/db_impl_write.cc | 5 +++ include/rocksdb/options.h | 5 ++- .../pessimistic_transaction_db.cc | 6 +++ utilities/transactions/transaction_test.cc | 6 +-- .../write_prepared_transaction_test.cc | 38 +------------------ 6 files changed, 20 insertions(+), 42 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f67a8210d24..44fc66bcbd8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,7 +5,7 @@ ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. -* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions, it offers higher throughput with however no compromise on guarantees. +* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. ### Performance Improvements diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index f4c72e298ee..92edc84254c 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -605,6 +605,11 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; if (pending_cnt == 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before notify ensures that cv is in waiting state when it is notified + // thus not missing the update to pending_memtable_writes_ even though it is + // not modified under the mutex. + std::lock_guard lck(switch_mutex_); switch_cv_.notify_all(); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index c8b4cc538d9..7d22fb67559 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -899,8 +899,9 @@ struct DBOptions { // ::MultiGet and Iterator's consistent-point-in-time view property. // If the application cannot tolerate the relaxed guarantees, it can implement // its own mechanisms to work around that and yet benefit from the higher - // throughput. Using TransactionDB with WRITE_PREPARED write policy is one way - // to achieve immutable snapshots despite unordered_write. + // throughput. Using TransactionDB with WRITE_PREPARED write policy and + // two_write_queues=true is one way to achieve immutable snapshots despite + // unordered_write. 
// // By default, i.e., when it is false, rocksdb does not advance the sequence // number for new snapshots unless all the writes with lower sequence numbers diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index ecf6d2ff387..c4e6e247756 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -232,6 +232,12 @@ Status TransactionDB::Open( return Status::NotSupported( "WRITE_UNPREPARED is currently incompatible with unordered_writes"); } + if (txn_db_options.write_policy == WRITE_PREPARED && + db_options.unordered_write && !db_options.two_write_queues) { + return Status::NotSupported( + "WRITE_UNPREPARED is incompatible with unordered_writes if " + "two_write_queues is not enabled."); + } std::vector column_families_copy = column_families; std::vector compaction_enabled_cf_indices; diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 6ea1fc70213..3c8036614f0 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -47,7 +47,6 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); @@ -58,7 +57,6 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite))); @@ -79,7 +77,9 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), - std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true))); + std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false), + std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, DoubleEmptyWrite) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 6bad81db0ee..b93f1a74ffe 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -573,7 +573,6 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite))); #ifndef ROCKSDB_VALGRIND_RUN @@ -644,29 +643,7 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20), std::make_tuple(false, false, 
WRITE_PREPARED, kOrderedWrite, 17, 20), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20), - std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20), - - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 10, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 11, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 12, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 13, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 14, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 15, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 16, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 17, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 18, 20), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 19, - 20))); + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20))); INSTANTIATE_TEST_CASE_P( TwoWriteQueues, SeqAdvanceConcurrentTest, @@ -704,18 +681,7 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10), std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10), - std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10), - - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 0, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 1, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 2, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 3, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 4, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 5, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 6, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 7, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 8, 10), - std::make_tuple(false, false, WRITE_PREPARED, kUnorderedWrite, 9, 10))); + std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10))); #endif // ROCKSDB_VALGRIND_RUN TEST_P(WritePreparedTransactionTest, CommitMapTest) { From 931c9df88677bcb6935eb353e79085790b79c8d4 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli <1004951+vjnadimpalli@users.noreply.github.com> Date: Mon, 20 May 2019 10:37:37 -0700 Subject: [PATCH 045/572] Use separate status code for column family drop and db shutdown in progress (#5275) Summary: Currently RocksDB uses Status::ShutdownInProgress to inform about column family drop. I would like to have a separate Status code for this event. 
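For illustration, a hedged sketch of how a caller can then tell the two conditions apart (the handler function here is hypothetical; only Status::ColumnFamilyDropped and Status::IsColumnFamilyDropped() come from this patch):
```
#include "rocksdb/status.h"

// Hypothetical dispatcher for a background job result. Before this patch,
// both conditions surfaced as IsShutdownInProgress(); now they are distinct.
void HandleBackgroundJobStatus(const rocksdb::Status& s) {
  if (s.IsColumnFamilyDropped()) {
    // Only the column family is gone; the DB is still healthy, so the
    // background work for that column family can simply be abandoned.
  } else if (s.IsShutdownInProgress()) {
    // The whole DB is closing; stop scheduling further work.
  } else if (!s.ok()) {
    // A genuine error that should be surfaced to the caller.
  }
}
```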
https://github.com/facebook/rocksdb/blob/master/include/rocksdb/status.h#L55 Comment on this: https://github.com/facebook/rocksdb/blob/abc4202e47eb433dc731911af38f232d2148428c/db/version_set.cc#L2742:L2743 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5275 Differential Revision: D15204583 Pulled By: vjnadimpalli fbshipit-source-id: 95e99e34b27bc165b554ecb8a48a7f8e60f21e2a --- HISTORY.md | 4 ++++ db/compaction_job.cc | 11 ++++++---- db/db_compaction_test.cc | 12 ++++++++--- db/db_flush_test.cc | 2 +- db/db_impl_compaction_flush.cc | 37 ++++++++++++++++++---------------- db/flush_job.cc | 10 +++++---- db/memtable_list.cc | 2 +- db/version_set.cc | 4 +--- include/rocksdb/status.h | 16 ++++++++++++++- java/rocksjni/portal.h | 8 ++++++++ util/status.cc | 3 +++ 11 files changed, 75 insertions(+), 34 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 44fc66bcbd8..e9f06b53280 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,8 +14,12 @@ * Merging iterator to avoid child iterator reseek for some cases * Reduce iterator key comparison for upper/lower bound check. +### General Improvements +* Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. + ### Bug Fixes + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index fb77431fddc..d1ae1932729 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1004,10 +1004,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); - if (status.ok() && - (shutting_down_->load(std::memory_order_relaxed) || cfd->IsDropped())) { - status = Status::ShutdownInProgress( - "Database shutdown or Column family drop during compaction"); + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); } if (status.ok()) { status = input->status(); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index b5033b66f0c..91a04205e07 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -3890,11 +3890,15 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { } Flush(1); } - auto manual_compaction_thread = port::Thread([this]() { + auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) - .IsShutdownInProgress()); + Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); + if (i == 0) { + ASSERT_TRUE(s.IsColumnFamilyDropped()); + } else { + ASSERT_TRUE(s.IsShutdownInProgress()); + } }); TEST_SYNC_POINT( diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index c603f60b460..876605b2e48 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -431,7 +431,7 @@ TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { cf_ids.push_back(cf_id); }
ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress()); + ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped()); Destroy(options); } diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 900ea4acdcd..38c69dfc1e4 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -201,7 +201,7 @@ Status DBImpl::FlushMemTableToOutputFile( cfd->current()->storage_info()->LevelSummary(&tmp)); } - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } @@ -254,7 +254,7 @@ Status DBImpl::FlushMemTablesToOutputFiles( snapshot_checker, log_buffer, thread_pri); if (!s.ok()) { status = s; - if (!s.IsShutdownInProgress()) { + if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { // At this point, DB is not shutting down, nor is cfd dropped. // Something is wrong, thus we break out of the loop. break; @@ -385,7 +385,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( for (const auto& e : exec_status) { if (!e.second.ok()) { s = e.second; - if (!e.second.IsShutdownInProgress()) { + if (!e.second.IsShutdownInProgress() && + !e.second.IsColumnFamilyDropped()) { // If a flush job did not return OK, and the CF is not dropped, and // the DB is not shutting down, then we have to return this result to // caller later. @@ -397,15 +398,11 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = error_status.ok() ? s : error_status; } - // If db is NOT shutting down, and one or more column families have been - // dropped. - // TODO: use separate status code for db shutdown and column family dropped. - if (s.IsShutdownInProgress() && - !shutting_down_.load(std::memory_order_acquire)) { + if (s.IsColumnFamilyDropped()) { s = Status::OK(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsShutdownInProgress() || s.IsColumnFamilyDropped()) { // Sync on all distinct output directories. for (auto dir : distinct_output_dirs) { if (dir != nullptr) { @@ -523,7 +520,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Need to undo atomic flush if something went wrong, i.e. s is not OK and // it is not because of CF drop. - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsColumnFamilyDropped()) { // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. 
for (int i = 0; i != num_cfs; ++i) { @@ -1052,7 +1049,7 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, @@ -1697,7 +1694,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, cfd->GetName().c_str()); bg_cv_.Wait(); } - if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + if (cfd->IsDropped()) { + return Status::ColumnFamilyDropped(); + } + if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } @@ -2159,7 +2159,7 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason, thread_pri); - if (!s.ok() && !s.IsShutdownInProgress() && + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to @@ -2184,7 +2184,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { // If flush failed, we want to delete all temporary files that we might have // created. Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); // delete unnecessary files if any, this is done outside the mutex if (job_context.HaveSomethingToClean() || job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { @@ -2248,7 +2249,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, mutex_.Unlock(); env_->SleepForMicroseconds(10000); // prevent hot loop mutex_.Lock(); - } else if (!s.ok() && !s.IsShutdownInProgress()) { + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of @@ -2272,7 +2274,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // If compaction failed, we want to delete all temporary files that we might // have created (they might not be all recorded in job_context in case of a // failure). Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2710,7 +2713,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (status.ok() || status.IsCompactionTooLarge()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", diff --git a/db/flush_job.cc b/db/flush_job.cc index 4226589e79d..21c1ff3a746 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -229,10 +229,12 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, // This will release and re-acquire the mutex. 
Status s = WriteLevel0Table(); - if (s.ok() && - (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { - s = Status::ShutdownInProgress( - "Database shutdown or Column family drop during flush"); + if (s.ok() && cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during flush"); + } + if ((s.ok() || s.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); } if (!s.ok()) { diff --git a/db/memtable_list.cc b/db/memtable_list.cc index d81b1d4d224..bdcbd218663 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -598,7 +598,7 @@ Status InstallMemtableAtomicFlushResults( imm->InstallNewVersion(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsColumnFamilyDropped()) { for (size_t i = 0; i != cfds.size(); ++i) { if (cfds[i]->IsDropped()) { continue; } diff --git a/db/version_set.cc b/db/version_set.cc index 03c5902728c..15b9d01feea 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3842,8 +3842,6 @@ Status VersionSet::LogAndApply( } } if (0 == num_undropped_cfds) { - // TODO (yanqin) maybe use a different status code to denote column family - // drop other than OK and ShutdownInProgress for (int i = 0; i != num_cfds; ++i) { manifest_writers_.pop_front(); } @@ -3851,7 +3849,7 @@ if (!manifest_writers_.empty()) { manifest_writers_.front()->cv.Signal(); } - return Status::ShutdownInProgress(); + return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log, diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 12e8070d1e8..ac97ce442af 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -58,7 +58,9 @@ class Status { kBusy = 11, kExpired = 12, kTryAgain = 13, - kCompactionTooLarge = 14 + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode }; Code code() const { return code_; } @@ -184,6 +186,15 @@ return Status(kCompactionTooLarge, msg, msg2); } + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + static Status NoSpace() { return Status(kIOError, kNoSpace); } static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIOError, kNoSpace, msg, msg2); } @@ -256,6 +267,9 @@ // Returns true iff the status indicates the proposed compaction is too large bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" // error condition.
Stricto sensu, a NoSpace error is an I/O error diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 193804ac318..d1585fcfa80 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -467,6 +467,8 @@ class StatusJni : public RocksDBNativeClass { return 0xC; case rocksdb::Status::Code::kTryAgain: return 0xD; + case rocksdb::Status::Code::kColumnFamilyDropped: + return 0xE; default: return 0x7F; // undefined } @@ -584,6 +586,12 @@ class StatusJni : public RocksDBNativeClass { new rocksdb::Status(rocksdb::Status::TryAgain( rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); break; + case 0xE: + // ColumnFamilyDropped + status = std::unique_ptr<rocksdb::Status>( + new rocksdb::Status(rocksdb::Status::ColumnFamilyDropped( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; case 0x7F: default: return nullptr; diff --git a/util/status.cc b/util/status.cc index c66bf6f8e16..9405944808d 100644 --- a/util/status.cc +++ b/util/status.cc @@ -109,6 +109,9 @@ std::string Status::ToString() const { case kTryAgain: type = "Operation failed. Try again.: "; break; + case kColumnFamilyDropped: + type = "Column family dropped: "; + break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast<int>(code())); From cd43446d017fd3929e5883bccf1206afafd57952 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 20 May 2019 13:47:32 -0700 Subject: [PATCH 046/572] Improve DBTablePropertiesTest.GetPropertiesOfTablesInRange (#5302) Summary: DBTablePropertiesTest.GetPropertiesOfTablesInRange sometimes trips an assertion because the generated LSM tree doesn't have an L1 file. Tighten the compaction triggering condition even further, hoping it goes away. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5302 Differential Revision: D15325971 Pulled By: siying fbshipit-source-id: 3e032bdb16fe8d98d5fcfcd65dd8be9781f3d6ae --- db/db_table_properties_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 5a54fd81c05..77ea0020dd6 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -139,12 +139,12 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { Options options; options.create_if_missing = true; options.write_buffer_size = 4096; - options.max_write_buffer_number = 3; + options.max_write_buffer_number = 2; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; + options.level0_stop_writes_trigger = 2; options.target_file_size_base = 2048; - options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_base = 40960; options.max_bytes_for_level_multiplier = 4; options.hard_pending_compaction_bytes_limit = 16 * 1024; options.num_levels = 8; From b2274da0e54da2a4c7faac571377edd8ece43cec Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 21 May 2019 12:17:15 -0700 Subject: [PATCH 047/572] LogWriter to only flush after finishing the whole record (#5328) Summary: Right now, in log writer, we call flush after writing each physical record. I don't see the necessity of it. Right now, the underlying writer has a buffer, so there isn't a concern that the write request is too large either. On the other hand, in an Env where every flush is expensive, the current approach is significantly slower than flushing only after the whole record finishes, especially when the record is very large.
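A small standalone model may make the change concrete; all names here are illustrative stand-ins, not RocksDB's actual classes. The record is still written as multiple physical fragments, but the flush now happens once per record rather than once per fragment.

```cpp
#include <cstddef>
#include <string>

// Toy stand-in for the buffered destination file writer.
class BufferedDest {
 public:
  void Append(const std::string& fragment) { buffer_ += fragment; }
  void Flush() { ++flush_count_; }  // the expensive call in some Envs
  int flush_count() const { return flush_count_; }

 private:
  std::string buffer_;
  int flush_count_ = 0;
};

// Model of the new AddRecord flow: append every fragment, flush once.
void AddRecord(BufferedDest* dest, const std::string& record,
               std::size_t block_size) {
  for (std::size_t off = 0; off < record.size(); off += block_size) {
    dest->Append(record.substr(off, block_size));  // one physical fragment
  }
  dest->Flush();  // previously this ran inside the loop, once per fragment
}
```

For example, with a 1 MB record and 32 KB blocks, the old flow would flush 32 times where the new flow flushes once.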
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5328 Differential Revision: D15425032 Pulled By: siying fbshipit-source-id: 440ebef002dfbb60c59d8388c9ddfc83d79700aa --- HISTORY.md | 1 + db/log_writer.cc | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e9f06b53280..b65f5a038b1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases * Reduce iterator key comparison for upper/lower bound check. +* Log Writer will flush after finishing the whole record, rather than a fragment. ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. diff --git a/db/log_writer.cc b/db/log_writer.cc index 6ee39198184..c46965e16e0 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -102,6 +102,13 @@ Status Writer::AddRecord(const Slice& slice) { left -= fragment_length; begin = false; } while (s.ok() && left > 0); + + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(); + } + } + return s; } @@ -146,11 +153,6 @@ Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { Status s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - if (!manual_flush_) { - s = dest_->Flush(); - } - } } block_offset_ += header_size + n; return s; From dda474399affd9042c237ed3ab47a5a3e8a83c92 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Tue, 21 May 2019 16:19:39 -0700 Subject: [PATCH 048/572] Remove PATENTS text from a few straggler files (#5326) Summary: Remove PATENTS related wording from a few stragglers which still reference the old PATENTS file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5326 Differential Revision: D15423297 Pulled By: sagar0 fbshipit-source-id: 4babcddfc120b7d2fed6eb3898287cf8012bf8ea --- port/win/win_jemalloc.cc | 6 +++--- util/crc32c_ppc.c | 8 +++----- util/crc32c_ppc.h | 8 +++----- util/crc32c_ppc_asm.S | 8 +++----- util/crc32c_ppc_constants.h | 8 +++----- util/ppc-opcode.h | 8 +++----- 6 files changed, 18 insertions(+), 28 deletions(-) diff --git a/port/win/win_jemalloc.cc b/port/win/win_jemalloc.cc index 3268a56affd..b2077938806 100644 --- a/port/win/win_jemalloc.cc +++ b/port/win/win_jemalloc.cc @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be diff --git a/util/crc32c_ppc.c b/util/crc32c_ppc.c index d9467d28262..654d606aaad 100644 --- a/util/crc32c_ppc.c +++ b/util/crc32c_ppc.c @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #define CRC_TABLE #include diff --git a/util/crc32c_ppc.h b/util/crc32c_ppc.h index 64a81a43102..c359061c610 100644 --- a/util/crc32c_ppc.h +++ b/util/crc32c_ppc.h @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once diff --git a/util/crc32c_ppc_asm.S b/util/crc32c_ppc_asm.S index 5142a8f259b..a317bf96b87 100644 --- a/util/crc32c_ppc_asm.S +++ b/util/crc32c_ppc_asm.S @@ -2,11 +2,9 @@ // Copyright (c) 2015 Anton Blanchard , IBM // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #include #include "ppc-opcode.h" diff --git a/util/crc32c_ppc_constants.h b/util/crc32c_ppc_constants.h index 21ec6fd9458..f6494cd01c3 100644 --- a/util/crc32c_ppc_constants.h +++ b/util/crc32c_ppc_constants.h @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (C) 2015, 2017 International Business Machines Corp. // All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once diff --git a/util/ppc-opcode.h b/util/ppc-opcode.h index e632ef26a3c..5cc5af0e30c 100644 --- a/util/ppc-opcode.h +++ b/util/ppc-opcode.h @@ -1,11 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Copyright (c) 2017 International Business Machines Corp. // All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once From 518cd1a62aeaaa9584516fdcf81bbfafbd75f18c Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Wed, 22 May 2019 09:17:39 -0700 Subject: [PATCH 049/572] Use GetCurrentManifestPath to locate current MANIFEST file (#5331) Summary: In version_set.cc, there is a function GetCurrentManifestPath. The goal of this task is to refactor ListColumnFamilies function so that ListColumnFamilies calls GetCurrentManifestPath to search for MANIFEST. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5331 Differential Revision: D15444524 Pulled By: HaoyuHuang fbshipit-source-id: 1dcbd030bc0f2e835695741f450bba150f2f2903 --- db/version_set.cc | 37 ++++++++++++++++++++----------------- db/version_set.h | 4 +++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 15b9d01feea..5723c6d9253 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4041,10 +4041,15 @@ Status VersionSet::ExtractInfoFromVersionEdit( return Status::OK(); } -Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { +Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_path, + uint64_t* manifest_file_number) { + assert(env != nullptr); assert(manifest_path != nullptr); + assert(manifest_file_number != nullptr); + std::string fname; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), &fname); + Status s = ReadFileToString(env, CurrentFileName(dbname), &fname); if (!s.ok()) { return s; } @@ -4054,12 +4059,12 @@ Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { // remove the trailing '\n' fname.resize(fname.size() - 1); FileType type; - bool parse_ok = ParseFileName(fname, &manifest_file_number_, &type); + bool parse_ok = ParseFileName(fname, manifest_file_number, &type); if (!parse_ok || type != kDescriptorFile) { return Status::Corruption("CURRENT file corrupted"); } - *manifest_path = dbname_; - if (dbname_.back() != '/') { + *manifest_path = dbname; + if (dbname.back() != '/') { manifest_path->push_back('/'); } *manifest_path += fname; @@ -4080,7 +4085,8 @@ Status VersionSet::Recover( // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; - Status s = GetCurrentManifestPath(&manifest_path); + Status s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); if (!s.ok()) { return s; } @@ -4321,26 +4327,22 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, // so we're fine using the defaults EnvOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env, CurrentFileName(dbname), ¤t); + std::string manifest_path; + uint64_t manifest_file_number; + Status s = GetCurrentManifestPath(dbname, env, &manifest_path, + &manifest_file_number); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { - 
return Status::Corruption("CURRENT file does not end with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname + "/" + current; std::unique_ptr file_reader; { std::unique_ptr file; - s = env->NewSequentialFile(dscname, &file, soptions); + s = env->NewSequentialFile(manifest_path, &file, soptions); if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), dscname)); + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); } std::map column_family_names; @@ -5510,7 +5512,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( Status s; do { std::string manifest_path; - s = GetCurrentManifestPath(&manifest_path); + s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); std::unique_ptr manifest_file; if (s.ok()) { if (nullptr == manifest_reader->get() || diff --git a/db/version_set.h b/db/version_set.h index d82c5b47291..28ad0c2c234 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -807,7 +807,9 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); - Status GetCurrentManifestPath(std::string* manifest_filename); + static Status GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_filename, + uint64_t* manifest_file_number); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families From 3d9d77d9006c246ea54656440fb29eebfa048f8b Mon Sep 17 00:00:00 2001 From: Thomas Fersch Date: Wed, 22 May 2019 23:38:09 -0700 Subject: [PATCH 050/572] Restrict L0->L0 compaction according to max_compaction_bytes option (#5329) Summary: Modified FindIntraL0Compaction to stop picking more files if total amount of compensated bytes would be larger than max_compaction_bytes option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5329 Differential Revision: D15435728 Pulled By: ThomasFersch fbshipit-source-id: d118a6da88d5df8ee20944422ade37cf6b15d60c --- db/compaction_picker.cc | 16 +++++++--- db/compaction_picker.h | 17 +++++++++++ db/compaction_picker_fifo.cc | 3 +- db/compaction_picker_test.cc | 59 ++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index d6d7b69876e..4bd8ff0e33a 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -42,19 +42,23 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, CompactionInputFiles* comp_inputs) { size_t compact_bytes = static_cast(level_files[0]->fd.file_size); + uint64_t compensated_compact_bytes = level_files[0]->compensated_file_size; size_t compact_bytes_per_del_file = port::kMaxSizet; - // compaction range will be [0, span_len). + // Compaction range will be [0, span_len). size_t span_len; - // pull in files until the amount of compaction work per deleted file begins - // increasing. + // Pull in files until the amount of compaction work per deleted file begins + // increasing or maximum total compaction size is reached. 
size_t new_compact_bytes_per_del_file = 0; for (span_len = 1; span_len < level_files.size(); ++span_len) { compact_bytes += static_cast(level_files[span_len]->fd.file_size); + compensated_compact_bytes += level_files[span_len]->compensated_file_size; new_compact_bytes_per_del_file = compact_bytes / span_len; if (level_files[span_len]->being_compacted || - new_compact_bytes_per_del_file > compact_bytes_per_del_file) { + new_compact_bytes_per_del_file > compact_bytes_per_del_file || + compensated_compact_bytes > max_compaction_bytes) { break; } compact_bytes_per_del_file = new_compact_bytes_per_del_file; @@ -1627,7 +1631,9 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() { return false; } return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - port::kMaxUint64, &start_level_inputs_); + port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_); } } // namespace diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 01f5495e67b..250566b1065 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -273,9 +273,26 @@ class NullCompactionPicker : public CompactionPicker { }; #endif // !ROCKSDB_LITE +// Attempts to find an intra L0 compaction conforming to the given parameters. +// +// @param level_files Metadata for L0 files. +// @param min_files_to_compact Minimum number of files required to +// do the compaction. +// @param max_compact_bytes_per_del_file Maximum average size in bytes per +// file that is going to get deleted by +// the compaction. +// @param max_compaction_bytes Maximum total size in bytes (in terms +// of compensated file size) for files +// to be compacted. +// @param [out] comp_inputs If a compaction was found, will be +// initialized with corresponding input +// files. Cannot be nullptr. +// +// @return true iff compaction was found. bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, CompactionInputFiles* comp_inputs); CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, diff --git a/db/compaction_picker_fifo.cc b/db/compaction_picker_fifo.cc index 1322989e568..eadb31f9ee5 100644 --- a/db/compaction_picker_fifo.cc +++ b/db/compaction_picker_fifo.cc @@ -134,7 +134,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( mutable_cf_options .level0_file_num_compaction_trigger /* min_files_to_compact */ , - max_compact_bytes_per_del_file, &comp_inputs)) { + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 31325c12893..c759dae8b6c 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -1478,6 +1478,65 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 1000000u; + NewVersionStorage(6, kCompactionStyleLevel); + + // All 5 L0 files will be picked for intra L0 compaction. 
The one L1 file + // spans entire L0 key range and is marked as being compacted to avoid + // L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U); + Add(0, 2U, "151", "200", 200000U); + Add(0, 3U, "201", "250", 200000U); + Add(0, 4U, "251", "300", 200000U); + Add(0, 5U, "301", "350", 200000U); + Add(1, 6U, "100", "350", 200000U); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0U, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U); + Add(0, 2U, "151", "200", 200000U); + Add(0, 3U, "201", "250", 200000U); + Add(0, 4U, "251", "300", 200000U); + Add(0, 5U, "301", "350", 200000U); + Add(1, 6U, "100", "350", 200000U); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0U, compaction->output_level()); +} + } // namespace rocksdb int main(int argc, char** argv) { From 2095ae88585f8ee1ef24b07231f536ba583dd345 Mon Sep 17 00:00:00 2001 From: Silver Chan Date: Thu, 23 May 2019 14:00:20 -0700 Subject: [PATCH 051/572] fixed db_stress.cc build error (#5307) Summary: when building this file using Xcode 10.2.1 on macOS 10.14, the compiler reports this error: ` rocksdb/tools/db_stress.cc:3613:33: error: implicit instantiation of undefined template 'std::__1::array<std::string, 10>' std::array<std::string, 10> keys = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; /usr/include/c++/v1/__tuple:223:64: note: template is declared here template <class _Tp, size_t _Size> struct _LIBCPP_TEMPLATE_VIS array; ^ 1 error generated. ` Including <array> fixes this error.
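The root cause is that libc++ only forward-declares `std::array` (in `<__tuple>`), so any translation unit that instantiates it must include `<array>` itself. A minimal illustration, with the element type assumed to be `std::string` based on the initializer in the diagnostic above:

```cpp
#include <array>   // without this include, the declaration below fails on
                   // libc++ with "implicit instantiation of undefined template"
#include <string>

std::array<std::string, 10> keys = {"0", "1", "2", "3", "4",
                                    "5", "6", "7", "8", "9"};
```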
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5307 Differential Revision: D15475217 Pulled By: sagar0 fbshipit-source-id: b04a7658c2ca2573157028863b3a80f5ab52b9de --- tools/db_stress.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 6eb974e0934..579178efffc 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -38,6 +38,7 @@ int main() { #include #include #include +#include #include #include #include From 40aa520a51bbf5b8bae54861a7c9c433a1b40006 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 23 May 2019 14:19:12 -0700 Subject: [PATCH 052/572] Add class comment for BlockFetcher Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5337 Differential Revision: D15482289 Pulled By: ltamasi fbshipit-source-id: 8639ca78c1b8dfcc337a742d4d81d5752f12545f --- table/block_fetcher.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/table/block_fetcher.h b/table/block_fetcher.h index b5fee941597..0dcdfc76125 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -13,14 +13,28 @@ #include "util/memory_allocator.h" namespace rocksdb { + +// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or +// persistent cache provided (if any) to try to avoid reading from the file +// directly. Note that both the prefetch buffer and the persistent cache are +// optional; also, note that the persistent cache may be configured to store either +// compressed or uncompressed blocks. +// +// If the retrieved block is compressed and the do_uncompress flag is set, +// BlockFetcher uncompresses the block (using the uncompression dictionary, +// if provided, to prime the compression algorithm), and returns the resulting +// uncompressed block data. Otherwise, it returns the original block. +// +// Two read options affect the behavior of BlockFetcher: if verify_checksums is +// true, the checksum of the (original) block is checked; if fill_cache is true, +// the block is added to the persistent cache if needed. +// +// Memory for uncompressed and compressed blocks is allocated as needed +// using memory_allocator and memory_allocator_compressed, respectively +// (if provided; otherwise, the default allocator is used). + class BlockFetcher { public: - // Read the block identified by "handle" from "file". - // The only relevant option is options.verify_checksums for now. - // On failure return non-OK. - // On success fill *result and return OK - caller owns *result - // @param uncompression_dict Data for presetting the compression library's - // dictionary. BlockFetcher(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, From dc30a9b69bc2c9f38e7e3266cfeb7983d2712ca4 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 May 2019 15:53:37 -0700 Subject: [PATCH 053/572] Add comments to db/db_iter.h (#5340) Summary: Add file comment in db/db_iter.h and minor changes in other parts. 
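Since the header comment added below describes DBIter's snapshot semantics, a short sketch of the public API those semantics serve may help; `db` is assumed to be an open `rocksdb::DB*`, and the function name is illustrative.

```cpp
#include <cassert>
#include "rocksdb/db.h"

// Iterate over the user-key view pinned at a snapshot: for each user key,
// the iterator exposes the newest value whose sequence number is <= the
// snapshot's, with tombstones and older versions resolved internally.
void IterateAtSnapshot(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  rocksdb::ReadOptions ropts;
  ropts.snapshot = snap;  // writes after this point are invisible here
  rocksdb::Iterator* it = db->NewIterator(ropts);
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // it->key() / it->value() give the snapshot-consistent user-key view.
  }
  assert(it->status().ok());
  delete it;
  db->ReleaseSnapshot(snap);
}
```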
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5340 Differential Revision: D15484605 Pulled By: siying fbshipit-source-id: 173771f9d5bd51303de5410ee5afd0a4af9d6572 --- db/db_iter.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/db/db_iter.h b/db/db_iter.h index a640f0296e5..8d8af3fd292 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -21,11 +21,36 @@ namespace rocksdb { +// This file declares the factory functions of DBIter, in its original form +// or a wrapped form with class ArenaWrappedDBIter, which is defined here. +// Class DBIter, which is declared and implemented inside db_iter.cc, is +// an iterator that converts internal keys (yielded by an InternalIterator) +// that were live at the specified sequence number into appropriate user +// keys. +// Each internal key consists of a user key, a sequence number, and a value +// type. DBIter deals with multiple key versions, tombstones, merge operands, +// etc, and exposes an Iterator. +// For example, DBIter may wrap the following InternalIterator: +// user key: AAA value: v3 seqno: 100 type: Put +// user key: AAA value: v2 seqno: 97 type: Put +// user key: AAA value: v1 seqno: 95 type: Put +// user key: BBB value: v1 seqno: 90 type: Put +// user key: BBC value: N/A seqno: 98 type: Delete +// user key: BBC value: v1 seqno: 95 type: Put +// If the snapshot passed in is 102, then the DBIter is expected to +// expose the following iterator: +// key: AAA value: v3 +// key: BBB value: v1 +// If the snapshot passed in is 96, then it should expose: +// key: AAA value: v1 +// key: BBB value: v1 +// key: BBC value: v1 +// class Arena; class DBIter; // Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number +// "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. extern Iterator* NewDBIterator( Env* env, const ReadOptions& read_options, @@ -41,6 +66,8 @@ extern Iterator* NewDBIterator( // an iterator hierarchy whose memory can be allocated inline. In that way, // accessing the iterator tree can be more cache friendly. It is also faster // to allocate. +// When using the class's Iterator interface, the behavior is exactly +// the same as the inner DBIter. class ArenaWrappedDBIter : public Iterator { public: virtual ~ArenaWrappedDBIter(); From 02830a20f8673de7b332a42e4cb376f79de0b121 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 May 2019 16:16:38 -0700 Subject: [PATCH 054/572] Add comments in db/dbformat.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5342 Differential Revision: D15485238 Pulled By: siying fbshipit-source-id: a56b374584cb1d815c1173907a807d90b37d4dd6 --- db/dbformat.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/db/dbformat.h b/db/dbformat.h index c850adcb01a..437119fb775 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -28,6 +28,12 @@ namespace rocksdb { +// The file declares data structures and functions that deal with internal +// keys. +// Each internal key contains a user key, a sequence number (SequenceNumber) +// and a type (ValueType), and they are usually encoded together. +// There are some related helper classes here. + class InternalKey; // Value types encoded as the last component of internal keys.
@@ -88,6 +94,8 @@ static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1); static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; +// The data structure that represents an internal key in the way that user_key, +// sequence number and type are stored in separate forms. struct ParsedInternalKey { Slice user_key; SequenceNumber sequence; @@ -192,9 +200,7 @@ class InternalKeyComparator } }; -// Modules in this directory should keep internal keys wrapped inside -// the following class instead of plain strings so that we do not -// incorrectly use string comparisons instead of an InternalKeyComparator. +// The class represents the internal key in encoded form. class InternalKey { private: std::string rep_; @@ -295,6 +301,12 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { return num >> 8; } +// The class to store keys in an efficient way. It allows: +// 1. Users can either copy the key into it, or have it point to an unowned +// address. +// 2. For a copied key, a short inline buffer is kept to reduce memory +// allocation for smaller keys. +// 3. It tracks user key or internal key, and allows conversion between them. class IterKey { public: IterKey() @@ -506,6 +518,8 @@ class IterKey { void operator=(const IterKey&) = delete; }; +// Convert from a SliceTransform of user keys to a SliceTransform of +// internal keys. class InternalKeySliceTransform : public SliceTransform { public: explicit InternalKeySliceTransform(const SliceTransform* transform) @@ -631,6 +645,7 @@ inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, return r; } +// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey. struct ParsedInternalKeyComparator { explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) : cmp(c) {} From 38a06aa2254ed363762c9f735df3638eb22b73b2 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 May 2019 16:22:13 -0700 Subject: [PATCH 055/572] Improve comments of classes for PlainTable (#5339) Summary: Simply add some comments. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5339 Differential Revision: D15485315 Pulled By: siying fbshipit-source-id: 4594b1c4c967e6bd08aa7fa08a37df3481df1938 --- table/plain_table_builder.h | 3 +++ table/plain_table_factory.h | 14 +++++++++++++- table/plain_table_index.h | 26 ++++++++++++++++++++++++-- table/plain_table_key_coding.h | 15 ++++++++++++--- table/plain_table_reader.h | 13 +++++++------ 5 files changed, 59 insertions(+), 12 deletions(-) diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index ca0879a4e1d..9a5b44b9c2c 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -24,6 +24,9 @@ class BlockHandle; class WritableFile; class TableBuilder; +// The builder class of PlainTable. For a description of the PlainTable format, +// see comments of class PlainTableFactory, where instances of +// PlainTableBuilder are created. class PlainTableBuilder: public TableBuilder { public: // Create a builder that will store the contents of the table it is diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index dade1566096..1bd155f93e9 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -24,7 +24,19 @@ class WritableFile; class Table; class TableBuilder; -// IndexedTable requires fixed length key, configured as a constructor +// PlainTableFactory is the entry point to the PlainTable format of +// SST files.
It returns instances of PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of +// the following downsides: +// 1. Data compression is not supported. +// 2. Data is not checksummed. +// it is not recommended to use this format on other types of file systems. +// +// PlainTable requires a fixed length key, configured as a constructor // parameter of the factory class. Output file format: // +-------------+-----------------+ // | version | user_key_length | diff --git a/table/plain_table_index.h b/table/plain_table_index.h index 360d998279a..1457fd00d81 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -20,6 +20,12 @@ namespace rocksdb { +// The file contains two classes, PlainTableIndex and PlainTableIndexBuilder. +// The two classes implement the index format of PlainTable. +// For a description of the PlainTable format, see comments of class +// PlainTableFactory. +// +// // PlainTableIndex contains index_size_ buckets, each of which is a // 32-bit integer. The lower 31 bits contain an offset value (explained below) // and the first bit of the integer indicates type of the offset. @@ -55,6 +61,10 @@ namespace rocksdb { // .... // record N file offset: fixedint32 // + +// The class loads the index block from a PlainTable SST file, and executes +// the index lookup. +// The class is used by PlainTableReader class. class PlainTableIndex { public: enum IndexSearchResult { @@ -72,11 +82,22 @@ class PlainTableIndex { index_(nullptr), sub_index_(nullptr) {} + // The function that executes the lookup in the hash table. + // The hash key is `prefix_hash`. The function fills the hash bucket + // content in `bucket_value`, which is up to the caller to interpret. IndexSearchResult GetOffset(uint32_t prefix_hash, uint32_t* bucket_value) const; - Status InitFromRawData(Slice data); + // Initialize data from `index_data`, which points to raw data for + // the index stored in the SST file. + Status InitFromRawData(Slice index_data); + // Decode the sub-index for a specific hash bucket. + // The `offset` is the value returned as `bucket_value` by GetOffset() + // and is only valid when the return value is `kSubindex`. + // The return value is the pointer to the starting address of the + // sub-index. `upper_bound` is filled with the value indicating how many + // entries the sub-index has. const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, uint32_t* upper_bound) const { const char* index_ptr = &sub_index_[offset]; @@ -106,9 +127,10 @@ class PlainTableIndex { // After calling Finish(), it returns Slice, which is usually // used either to initialize PlainTableIndex or // to save index to sst file. -// For more details about the index, please refer to: +// For more details about the index, please refer to: // https://github.com/facebook/rocksdb/wiki/PlainTable-Format // #wiki-in-memory-index-format +// The class is used by PlainTableBuilder class.
class PlainTableIndexBuilder { public: PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index 9a27ad06b78..93f8f7af4b5 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -11,6 +11,11 @@ #include "db/dbformat.h" #include "table/plain_table_reader.h" +// The file contains three helper classes of PlainTable format, +// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. +// These classes implement the lowest-level operations of PlainTable. +// Actual data format of the key is documented in comments of class +// PlainTableFactory. namespace rocksdb { class WritableFile; struct ParsedInternalKey; struct PlainTableReaderFileInfo; enum PlainTableEntryType : unsigned char; -// Helper class to write out a key to an output file -// Actual data format of the key is documented in plain_table_factory.h +// Helper class for PlainTable format to write out a key to an output file +// The class is used in PlainTableBuilder. class PlainTableKeyEncoder { public: explicit PlainTableKeyEncoder(EncodingType encoding_type, @@ -53,6 +58,10 @@ class PlainTableKeyEncoder { IterKey pre_prefix_; }; +// The class does raw file reads for PlainTableReader. +// It hides whether it is an mmap read or a non-mmap read. +// The class is implemented in a way to favor the performance of the mmap case. +// The class is used by PlainTableReader. class PlainTableFileReader { public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) @@ -122,7 +131,7 @@ class PlainTableFileReader { }; // A helper class to decode keys from input buffer -// Actual data format of the key is documented in plain_table_factory.h +// The class is used by PlainTableReader. class PlainTableKeyDecoder { public: explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 14760f20a57..12b22aaf12e 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -56,16 +56,17 @@ struct PlainTableReaderFileInfo { file(std::move(_file)) {} }; +// The reader class of PlainTable. For a description of the PlainTable format, +// see comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableReader: public TableReader { + public: // Based on the output file format shown in plain_table_factory.h: -// When opening the output file, IndexedTableReader creates a hash table -// from key prefixes to offset of the output file. IndexedTable will decide +// When opening the output file, PlainTableReader creates a hash table +// from key prefixes to offset of the output file. PlainTable will decide // whether it points to the data offset of the first key with the key prefix // or the offset of it. If too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk.
-// -// The implementation of IndexedTableReader requires output file is mmaped -class PlainTableReader: public TableReader { - public: static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, From 09b534cc2f36dc9e9ab13d1067fa8209456e9771 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 23 May 2019 16:26:07 -0700 Subject: [PATCH 056/572] improve comments for CompactionJob (#5341) Summary: add class/function level comments to the header file Pull Request resolved: https://github.com/facebook/rocksdb/pull/5341 Differential Revision: D15485442 Pulled By: miasantreble fbshipit-source-id: 9f11e2a1cd3ce0f4990f01353d0a6f4b050615cf --- db/compaction_job.cc | 6 ------ db/compaction_job.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index d1ae1932729..44fb385d1b3 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -415,7 +415,6 @@ void CompactionJob::Prepare() { write_hint_ = c->column_family_data()->CalculateSSTWriteHint(c->output_level()); - // Is this compaction producing files at the bottommost level? bottommost_level_ = c->bottommost_level(); if (c->ShouldFormSubcompactions()) { @@ -445,11 +444,6 @@ struct RangeWithSize { : range(a, b), size(s) {} }; -// Generates a histogram representing potential divisions of key ranges from -// the input. It adds the starting and/or ending keys of certain input files -// to the working set and then finds the approximate size of data in between -// each consecutive pair of slices. Then it divides these ranges into -// consecutive groups such that each group has a similar size. void CompactionJob::GenSubcompactionBoundaries() { auto* c = compact_->compaction; auto* cfd = c->column_family_data(); diff --git a/db/compaction_job.h b/db/compaction_job.h index b3a0f2eb4b5..a37c54de809 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -55,6 +55,11 @@ class Version; class VersionEdit; class VersionSet; +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. class CompactionJob { public: CompactionJob( @@ -80,17 +85,28 @@ class CompactionJob { CompactionJob& operator=(const CompactionJob& job) = delete; // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction void Prepare(); // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results Status Run(); // REQUIRED: mutex held + // Add compaction input/output to the current version Status Install(const MutableCFOptions& mutable_cf_options); private: struct SubcompactionState; void AggregateStatistics(); + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); // update the thread status for starting a compaction. 
@@ -163,6 +179,7 @@ class CompactionJob { EventLogger* event_logger_; + // Is this compaction creating a file in the bottom most level? bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; From 6a54278b4a9b86a1cce359e78db61015e7a1cc07 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 23 May 2019 16:26:08 -0700 Subject: [PATCH 057/572] add class level comment for RepeatableThread Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5344 Differential Revision: D15485431 Pulled By: miasantreble fbshipit-source-id: 9c0f6cf0d826743e743012549976705ceb8cc0c4 --- util/repeatable_thread.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index 967cc49945e..2d4729da02c 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -15,6 +15,9 @@ namespace rocksdb { +// Simple wrapper around port::Thread that supports calling a callback every +// X seconds. If you pass in 0, then it will call your callback repeatedly +// without delay. class RepeatableThread { public: RepeatableThread(std::function function, From 74a334a2eb8db6c2ba2f38382be287af908e63c0 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Thu, 23 May 2019 21:54:23 -0700 Subject: [PATCH 058/572] Provide an option so that SST ingestion won't fall back to copy after hard linking fails (#5333) Summary: RocksDB always tries to perform a hard link operation on the external SST file to ingest. This operation can fail if the external SST resides on a different device/FS, or the underlying FS does not support hard link. Currently RocksDB assumes that if the link fails, the user is willing to perform file copy, which is not true according to the post. This commit provides an option named 'failed_move_fall_back_to_copy' for users to choose which behavior they want. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5333 Differential Revision: D15457597 Pulled By: HaoyuHuang fbshipit-source-id: f3626e13f845db4f7ed970a53ec8a2b1f0d62214 --- HISTORY.md | 3 +- db/external_sst_file_ingestion_job.cc | 19 ++--- db/external_sst_file_test.cc | 109 ++++++++++++++++++++++---- include/rocksdb/options.h | 2 + 4 files changed, 106 insertions(+), 27 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b65f5a038b1..40d11096df0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. +* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. @@ -20,7 +21,7 @@ ### Bug Fixes - + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. 
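Before the implementation diffs below, a usage sketch of the new option; `db` is assumed to be an open `rocksdb::DB*` and the function name is illustrative. It requests a move but fails the ingestion outright, instead of silently copying, when the hard link cannot be created.

```cpp
#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status IngestWithoutCopy(rocksdb::DB* db, const std::string& file) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;
  // New in this patch; defaults to true, which preserves the old
  // fall-back-to-copy behavior.
  ifo.failed_move_fall_back_to_copy = false;
  return db->IngestExternalFile({file}, ifo);
}
```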
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 28b481678ab..588ac5110a2 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare( // Copy/Move external files into DB for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - + f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; - } else { - f.copy_file = false; } } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); - f.copy_file = true; } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index cbbb2fa2627..3850a2a031e 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -16,6 +16,54 @@ namespace rocksdb { +// A test environment that can be configured to fail the Link operation. +class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : DBTestBase("/external_sst_file_test"), + test_env_(new ExternalSSTTestEnv(env_, true)) { + sst_files_dir_ = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + std::string sst_files_dir_; + Options options_; + ExternalSSTTestEnv* test_env_; +}; + class ExternalSSTFileTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { } /* - * Test and verify the functionality of ingestion_options.move_files. 
+ * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy */ -TEST_F(ExternalSSTFileTest, LinkExternalSst) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - DestroyAndReopen(options); +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; std::string file_path = sst_files_dir_ + "file1.sst"; // Create SstFileWriter for default column family - SstFileWriter sst_file_writer(EnvOptions(), options); + SstFileWriter sst_file_writer(env_options, options_); ASSERT_OK(sst_file_writer.Open(file_path)); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); @@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file_path, &file_size)); - IngestExternalFileOptions ifo; - ifo.move_files = true; - ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + bool copyfile = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); @@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { bytes_copied += stats.bytes_written; bytes_moved += stats.bytes_moved; } - // If bytes_moved > 0, it means external sst resides on the same FS - // supporting hard link operation. Therefore, - // 0 bytes should be copied, and the bytes_moved == file_size. - // Otherwise, FS does not support hard link, or external sst file resides on - // a different file system, then the bytes_copied should be equal to - // file_size. - if (bytes_moved > 0) { + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); ASSERT_EQ(0, bytes_copied); ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); } else { - ASSERT_EQ(file_size, bytes_copied); + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. 
+      ASSERT_FALSE(copyfile);
+      ASSERT_EQ(0, bytes_copied);
+    }
   }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
 class TestIngestExternalFileListener : public EventListener {
@@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
                                          std::make_tuple(true, false),
                                          std::make_tuple(true, true)));
 
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Values(std::make_tuple(true, false),
+                                        std::make_tuple(true, true),
+                                        std::make_tuple(false, false)));
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 7d22fb67559..cc7119410a0 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1398,6 +1398,8 @@ struct CompactRangeOptions {
 struct IngestExternalFileOptions {
   // Can be set to true to move the files instead of copying them.
   bool move_files = false;
+  // If set to true, ingestion falls back to copy when move fails.
+  bool failed_move_fall_back_to_copy = true;
   // If set to false, an ingested file keys could appear in existing snapshots
   // that where created before the file was ingested.
   bool snapshot_consistency = true;

From 5d359fc337803b1b365c7d151799e4d76f75b024 Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Fri, 24 May 2019 10:02:36 -0700
Subject: [PATCH 059/572] Document AlignedBuffer (#5345)

Summary:
Add comments to util/aligned_buffer.h

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5345

Differential Revision: D15496004

Pulled By: sagar0

fbshipit-source-id: 31bc6f35e88dedd74cff55febe02c9e761304f76
---
 util/aligned_buffer.h | 74 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 11 deletions(-)

diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h
index 2201b487770..dbff9c8109e 100644
--- a/util/aligned_buffer.h
+++ b/util/aligned_buffer.h
@@ -13,21 +13,47 @@
 namespace rocksdb {
 
+// This file contains utilities to handle the alignment of pages and buffers.
+
+// Truncate to a multiple of page_size, which is also a page boundary. This
+// helps in figuring out the right alignment.
+// Example:
+//   TruncateToPageBoundary(5000, 4096)  => 4096
+//   TruncateToPageBoundary(10000, 4096) => 8192
 inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
   s -= (s & (page_size - 1));
   assert((s % page_size) == 0);
   return s;
 }
 
+// Round up x to a multiple of y.
+// Example:
+//   Roundup(13, 5)   => 15
+//   Roundup(201, 16) => 208
 inline size_t Roundup(size_t x, size_t y) {
   return ((x + y - 1) / y) * y;
 }
 
+// Round down x to a multiple of y.
+// Example:
+//   Rounddown(13, 5)   => 10
+//   Rounddown(201, 16) => 192
 inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }
 
-// This class is to manage an aligned user
-// allocated buffer for direct I/O purposes
-// though can be used for any purpose.
+// AlignedBuffer manages a buffer by taking alignment into consideration, and
+// aligns the buffer start and end positions. It is mainly used for direct
+// I/O, though it can be used for other purposes as well.
+// It also supports expanding the managed buffer, and copying whole or part
+// of the data from the old buffer into the new expanded buffer. Such a copy
+// especially helps to avoid an IO to re-fetch the data from disk.
+//
+// Example:
+//   AlignedBuffer buf;
+//   buf.Alignment(alignment);
+//   buf.AllocateNewBuffer(user_requested_buf_size);
+//   ...
+// buf.AllocateNewBuffer(2*user_requested_buf_size, /*copy_data*/ true, +// copy_offset, copy_len); class AlignedBuffer { size_t alignment_; std::unique_ptr buf_; @@ -96,12 +122,21 @@ class AlignedBuffer { alignment_ = alignment; } - // Allocates a new buffer and sets bufstart_ to the aligned first byte. + // Allocates a new buffer and sets the start position to the first aligned + // byte. + // // requested_capacity: requested new buffer capacity. This capacity will be // rounded up based on alignment. - // copy_data: Copy data from old buffer to new buffer. + // copy_data: Copy data from old buffer to new buffer. If copy_offset and + // copy_len are not passed in and the new requested capacity is bigger + // than the existing buffer's capacity, the data in the exising buffer is + // fully copied over to the new buffer. // copy_offset: Copy data from this offset in old buffer. // copy_len: Number of bytes to copy. + // + // The function does nothing if the new requested_capacity is smaller than + // the current buffer capacity and copy_data is true i.e. the old buffer is + // retained as is. void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false, uint64_t copy_offset = 0, size_t copy_len = 0) { assert(alignment_ > 0); @@ -110,7 +145,7 @@ class AlignedBuffer { copy_len = copy_len > 0 ? copy_len : cursize_; if (copy_data && requested_capacity < copy_len) { // If we are downsizing to a capacity that is smaller than the current - // data in the buffer. Ignore the request. + // data in the buffer -- Ignore the request. return; } @@ -132,8 +167,15 @@ class AlignedBuffer { capacity_ = new_capacity; buf_.reset(new_buf); } - // Used for write - // Returns the number of bytes appended + + // Append to the buffer. + // + // src : source to copy the data from. + // append_size : number of bytes to copy from src. + // Returns the number of bytes appended. + // + // If append_size is more than the remaining buffer size only the + // remaining-size worth of bytes are copied. size_t Append(const char* src, size_t append_size) { size_t buffer_remaining = capacity_ - cursize_; size_t to_copy = std::min(append_size, buffer_remaining); @@ -145,6 +187,12 @@ class AlignedBuffer { return to_copy; } + // Read from the buffer. + // + // dest : destination buffer to copy the data to. + // offset : the buffer offset to start reading from. + // read_size : the number of bytes to copy from the buffer to dest. + // Returns the number of bytes read/copied to dest. size_t Read(char* dest, size_t offset, size_t read_size) const { assert(offset < cursize_); @@ -158,7 +206,7 @@ class AlignedBuffer { return to_read; } - /// Pad to alignment + // Pad to the end of alignment with "padding" void PadToAlignmentWith(int padding) { size_t total_size = Roundup(cursize_, alignment_); size_t pad_size = total_size - cursize_; @@ -176,7 +224,7 @@ class AlignedBuffer { cursize_ += pad_size; } - // After a partial flush move the tail to the beginning of the buffer + // After a partial flush move the tail to the beginning of the buffer. void RefitTail(size_t tail_offset, size_t tail_size) { if (tail_size > 0) { memmove(bufstart_, bufstart_ + tail_offset, tail_size); @@ -184,7 +232,11 @@ class AlignedBuffer { cursize_ = tail_size; } - // Returns place to start writing + // Returns a place to start appending. + // WARNING: Note that it is possible to write past the end of the buffer if + // the buffer is modified without using the write APIs or encapsulation + // offered by AlignedBuffer. 
It is up to the user to guard against such + // errors. char* Destination() { return bufstart_ + cursize_; } From 94c78b11e411d15f23bbc0c3c3f95c7e070ea528 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 10:27:28 -0700 Subject: [PATCH 060/572] improve comments for statistics.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5351 Differential Revision: D15496346 Pulled By: miasantreble fbshipit-source-id: eeb619e6bd8616003ba35b0cd4bb8050e6a8cb4d --- include/rocksdb/statistics.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 3b2b2e048c7..653b460cbdd 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -447,6 +447,10 @@ struct HistogramData { double min = 0.0; }; +// StatsLevel can be used to reduce statistics overhead by skipping certain +// types of stats in the stats collection process. +// Usage: +// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); enum StatsLevel : uint8_t { // Disable timer stats, and skip histogram stats kExceptHistogramOrTimers, @@ -464,7 +468,15 @@ enum StatsLevel : uint8_t { kAll, }; -// Analyze the performance of a db +// Analyze the performance of a db by providing cumulative stats over time. +// Usage: +// Options options; +// options.statistics = rocksdb::CreateDBStatistics(); +// Status s = DB::Open(options, kDBPath, &db); +// ... +// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +// HistogramData hist; +// options.statistics->histogramData(FLUSH_TIME, &hist); class Statistics { public: virtual ~Statistics() {} From 88ff80780b3ccdbf802625c8302b9e4405a09b66 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 10:36:26 -0700 Subject: [PATCH 061/572] improve comment for WalManager (#5350) Summary: att Pull Request resolved: https://github.com/facebook/rocksdb/pull/5350 Differential Revision: D15496467 Pulled By: miasantreble fbshipit-source-id: c29c0b143bf4df2040695a82be0feb9814ddb641 --- db/wal_manager.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/wal_manager.h b/db/wal_manager.h index 6caf1640c06..9d5afb25d5e 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -28,6 +28,10 @@ namespace rocksdb { #ifndef ROCKSDB_LITE + +// WAL manager provides the abstraction for reading the WAL files as a single +// unit. Internally, it opens and reads the files using Reader or Writer +// abstraction. class WalManager { public: WalManager(const ImmutableDBOptions& db_options, @@ -40,6 +44,8 @@ class WalManager { Status GetSortedWalFiles(VectorLogPtr& files); + // Allow user to tail transaction log to find all recent changes to the + // database that are newer than `seq_number`. 
Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options, From 98094f6caca6a5c0d2cff4c36f3bfdc7c1fcb7b6 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 24 May 2019 11:16:47 -0700 Subject: [PATCH 062/572] Add some comments for BlockContents Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5354 Differential Revision: D15496645 Pulled By: ltamasi fbshipit-source-id: 1282b1ce11fbc412d3d87b2688fd0586e7bb6b85 --- table/format.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/table/format.h b/table/format.h index f5858850559..84242303ec7 100644 --- a/table/format.h +++ b/table/format.h @@ -194,6 +194,10 @@ inline CompressionType get_block_compression_type(const char* block_data, return static_cast(block_data[block_size]); } +// Represents the contents of a block read from an SST file. Depending on how +// it's created, it may or may not own the actual block bytes. As an example, +// BlockContents objects representing data read from mmapped files only point +// into the mmapped region. struct BlockContents { Slice data; // Actual contents of data CacheAllocationPtr allocation; @@ -206,16 +210,20 @@ struct BlockContents { BlockContents() {} + // Does not take ownership of the underlying data bytes. BlockContents(const Slice& _data) : data(_data) {} + // Takes ownership of the underlying data bytes. BlockContents(CacheAllocationPtr&& _data, size_t _size) : data(_data.get(), _size), allocation(std::move(_data)) {} + // Takes ownership of the underlying data bytes. BlockContents(std::unique_ptr&& _data, size_t _size) : data(_data.get(), _size) { allocation.reset(_data.release()); } + // Returns whether the object has ownership of the underlying data bytes. bool own_bytes() const { return allocation.get() != nullptr; } // It's the caller's responsibility to make sure that this is From 767d1f3ff17b002659f48a520c84fbb09f6ac3fc Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 11:37:06 -0700 Subject: [PATCH 063/572] Improve comments for StatsHistoryIterator and InMemoryStatsHistoryIterator Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5346 Differential Revision: D15497679 Pulled By: miasantreble fbshipit-source-id: c10caf10293c3d9663bfb398a0d331326d1e9e67 --- db/db_impl.h | 1 - db/in_memory_stats_history.cc | 4 ++++ db/in_memory_stats_history.h | 19 +++++++++++++++++++ include/rocksdb/db.h | 5 +++-- include/rocksdb/stats_history.h | 20 +++++++++++++++++++- 5 files changed, 45 insertions(+), 4 deletions(-) diff --git a/db/db_impl.h b/db/db_impl.h index 08cb1949118..f574a8f4479 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -761,7 +761,6 @@ class DBImpl : public DB { static Status CreateAndNewDirectory(Env* env, const std::string& dirname, std::unique_ptr* directory); - // Given a time window, return an iterator for accessing stats history Status GetStatsHistory( uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; diff --git a/db/in_memory_stats_history.cc b/db/in_memory_stats_history.cc index 39355cfbe0a..e9e0cc74950 100644 --- a/db/in_memory_stats_history.cc +++ b/db/in_memory_stats_history.cc @@ -17,6 +17,10 @@ bool InMemoryStatsHistoryIterator::Valid() const { return valid_; } Status InMemoryStatsHistoryIterator::status() const { return status_; } +// Because of garbage collection, the next stats snapshot may or may not be +// right after the current one. 
When reading from DBImpl::stats_history_, this +// call will be protected by DB Mutex so it will not return partial or +// corrupted results. void InMemoryStatsHistoryIterator::Next() { // increment start_time by 1 to avoid infinite loop AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); diff --git a/db/in_memory_stats_history.h b/db/in_memory_stats_history.h index 4b52e23fffa..eeb679cc0a2 100644 --- a/db/in_memory_stats_history.h +++ b/db/in_memory_stats_history.h @@ -12,8 +12,20 @@ namespace rocksdb { +// InMemoryStatsHistoryIterator can be used to access stats history that was +// stored by an in-memory two level std::map(DBImpl::stats_history_). It keeps +// a copy of the stats snapshot (in stats_map_) that is currently being pointed +// to, which allows the iterator to access the stats snapshot even when +// the background garbage collecting thread purges it from the source of truth +// (`DBImpl::stats_history_`). In that case, the iterator will continue to be +// valid until a call to `Next()` returns no result and invalidates it. In +// some extreme cases, the iterator may also return fragmented segments of +// stats snapshots due to long gaps between `Next()` calls and interleaved +// garbage collection. class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { public: + // Setup InMemoryStatsHistoryIterator to return stats snapshots between + // microsecond timestamps [start_time, end_time) InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, DBImpl* db_impl) : start_time_(start_time), @@ -26,9 +38,16 @@ class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { bool Valid() const override; Status status() const override; + // Move to the next stats snapshot currently available + // This function may invalidate the iterator + // REQUIRES: Valid() void Next() override; + + // REQUIRES: Valid() uint64_t GetStatsTime() const override; + // This function is idempotent + // REQUIRES: Valid() const std::map& GetStatsMap() const override; private: diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 7b49b92c239..b0538433b4a 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1322,8 +1322,9 @@ class DB { // Needed for StackableDB virtual DB* GetRootDB() { return this; } - // Given a time window, return an iterator for accessing stats history - // User is responsible for deleting StatsHistoryIterator after use + // Given a window [start_time, end_time), setup a StatsHistoryIterator + // to access stats history. Note the start_time and end_time are epoch + // time measured in microsecond, and end_time is an exclusive bound. virtual Status GetStatsHistory( uint64_t /*start_time*/, uint64_t /*end_time*/, std::unique_ptr* /*stats_iterator*/) { diff --git a/include/rocksdb/stats_history.h b/include/rocksdb/stats_history.h index 40ea51d1ff0..1a841908170 100644 --- a/include/rocksdb/stats_history.h +++ b/include/rocksdb/stats_history.h @@ -11,7 +11,6 @@ #include #include -// #include "db/db_impl.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -19,6 +18,25 @@ namespace rocksdb { class DBImpl; +// StatsHistoryIterator is the main interface for users to programmatically +// access statistics snapshots that was automatically stored by RocksDB. +// Depending on options, the stats can be in memory or on disk. +// The stats snapshots are indexed by time that they were recorded, and each +// stats snapshot contains individual stat name and value at the time of +// recording. 
+// Example: +// std::unique_ptr stats_iter; +// Status s = db->GetStatsHistory(0 /* start_time */, +// env->NowMicros() /* end_time*/, +// &stats_iter); +// if (s.ok) { +// for (; stats_iter->Valid(); stats_iter->Next()) { +// uint64_t stats_time = stats_iter->GetStatsTime(); +// const std::map& stats_map = +// stats_iter->GetStatsMap(); +// process(stats_time, stats_map); +// } +// } class StatsHistoryIterator { public: StatsHistoryIterator() {} From 596cc1547a01b8299293f9fb43f219722eeb6dad Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 24 May 2019 12:03:16 -0700 Subject: [PATCH 064/572] Update comments in column_family.h (#5347) Summary: Document relationships of data structures declared in column_family.h Pull Request resolved: https://github.com/facebook/rocksdb/pull/5347 Differential Revision: D15496941 Pulled By: siying fbshipit-source-id: 47b37835abba26aa31a94fabea6b2775483e0ccb --- db/column_family.h | 108 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/db/column_family.h b/db/column_family.h index 7a1ae85bfd3..655cb159261 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -45,7 +45,113 @@ class InstrumentedMutexLock; struct SuperVersionContext; extern const double kIncSlowdownRatio; - +// This file contains a list of data structures for managing column family +// level metadata. +// +// The basic relationships among classes declared here are illustrated as +// following: +// +// +----------------------+ +----------------------+ +--------+ +// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | +// | +----------------------+ | +----------------------+ +----+---+ +// | +--------------------------+ | +// | | +-----------------------------+ +// | | | +// | | +-----------------------------v-------------------------------+ +// | | | | +// | | | ColumnFamilySet | +// | | | | +// | | +-------------+--------------------------+----------------+---+ +// | | | | | +// | +-------------------------------------+ | | +// | | | | v +// | +-------------v-------------+ +-----v----v---------+ +// | | | | | +// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... +// | | | | | +// +---> | | | +// | +---------+ | | +// | | MemTable| | | +// | | List | | | +// +--------+---+--+-+----+----+ +--------------------++ +// | | | | +// | | | | +// | | | +-----------------------+ +// | | +-----------+ | +// v +--------+ | | +// +--------+--------+ | | | +// | | | | +----------v----------+ +// +---> |SuperVersion 1.a +-----------------> | +// | +------+ | | MemTableListVersion | +// +---+-------------+ | | | | | +// | | | | +----+------------+---+ +// | current | | | | | +// | +-------------+ | |mem | | +// | | | | | | +// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ +// | | | | | | | | +// | Version 1.a | | memtable | | memtable | | memtable | +// | | | 1.a | | 1.b | | 1.c | +// +-------------+ | | | | | | +// +----------+ +----------+ +----------+ +// +// DBImpl keeps a ColumnFamilySet, which references to all column families by +// pointing to respective ColumnFamilyData object of each column family. +// This is how DBImpl can list and operate on all the column families. +// ColumnFamilyHandle also points to ColumnFamilyData directly, so that +// when a user executes a query, it can directly find memtables and Version +// as well as SuperVersion to the column family, without going through +// ColumnFamilySet. 
+// +// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables +// and SST files) indirectly, while ongoing operations may hold references +// to a current or an out-of-date SuperVersion, which in turn points to a +// point-in-time view of the LSM-tree. This guarantees the memtables and SST +// files being operated on will not go away, until the SuperVersion is +// unreferenced to 0 and destoryed. +// +// The following graph illustrates a possible referencing relationships: +// +// Column +--------------+ current +-----------+ +// Family +---->+ +------------------->+ | +// Data | SuperVersion +----------+ | Version A | +// | 3 | imm | | | +// Iter2 +----->+ | +-------v------+ +-----------+ +// +-----+--------+ | MemtableList +----------------> Empty +// | | Version r | +-----------+ +// | +--------------+ | | +// +------------------+ current| Version B | +// +--------------+ | +----->+ | +// | | | | +-----+-----+ +// Compaction +>+ SuperVersion +-------------+ ^ +// Job | 2 +------+ | |current +// | +----+ | | mem | +------------+ +// +--------------+ | | +---------------------> | +// | +------------------------> MemTable a | +// | mem | | | +// +--------------+ | | +------------+ +// | +--------------------------+ +// Iter1 +-----> SuperVersion | | +------------+ +// | 1 +------------------------------>+ | +// | +-+ | mem | MemTable b | +// +--------------+ | | | | +// | | +--------------+ +-----^------+ +// | |imm | MemtableList | | +// | +--->+ Version s +------------+ +// | +--------------+ +// | +--------------+ +// | | MemtableList | +// +------>+ Version t +--------> Empty +// imm +--------------+ +// +// In this example, even if the current LSM-tree consists of Version A and +// memtable a, which is also referenced by SuperVersion, two older SuperVersion +// SuperVersion2 and Superversion1 still exist, and are referenced by a +// compaction job and an old iterator Iter1, respectively. SuperVersion2 +// contains Version B, memtable a and memtable b; SuperVersion1 contains +// Version B and memtable b (mutable). As a result, Version B and memtable b +// are prevented from being destroyed or deleted. + // ColumnFamilyHandleImpl is the class that clients use to access different // column families. It has non-trivial destructor, which gets called when client // is done using the column family From f69e63dc5fa99277bc1e1ef6140383207be3c8ac Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 24 May 2019 12:20:14 -0700 Subject: [PATCH 065/572] Improve comments in compaction.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5356 Differential Revision: D15499033 Pulled By: siying fbshipit-source-id: 069ae48669484beaf668dd90389b8743b3309dc3 --- db/compaction.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/compaction.h b/db/compaction.h index 2cf737b676a..e9ded632503 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -14,6 +14,8 @@ #include "util/autovector.h" namespace rocksdb { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. // Utility for comparing sstable boundary keys. Returns -1 if either a or b is // null which provides the property that a==null indicates a key that is less @@ -63,7 +65,7 @@ class ColumnFamilyData; class VersionStorageInfo; class CompactionFilter; -// A Compaction encapsulates information about a compaction. +// A Compaction encapsulates metadata about a compaction. 
class Compaction {
 public:
  Compaction(VersionStorageInfo* input_version,
@@ -376,7 +378,7 @@ class Compaction {
   CompactionReason compaction_reason_;
 };
 
-// Utility function
+// Return the sum of the sizes of all files in `files`.
 extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
 
 }  // namespace rocksdb

From f66026c8c7a93473854966519d56c5d4fa115b24 Mon Sep 17 00:00:00 2001
From: Vijay Nadimpalli
Date: Fri, 24 May 2019 12:26:58 -0700
Subject: [PATCH 066/572] Comments for BlockBasedTable

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5352

Differential Revision: D15498477

Pulled By: vjnadimpalli

fbshipit-source-id: 08a981521848433362a56ac521c7fb83c7dd7b2a
---
 table/block_based_table_reader.h | 19 +++++++++++++++----
 table/table_reader.h             |  8 +++++---
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 8274f0cf965..270409b3ab6 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -59,9 +59,17 @@ class GetContext;
 
 typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
 
-// A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.  A Table may be safely accessed from
-// multiple threads without external synchronization.
+// Reader class for the BlockBasedTable format. For the format itself, refer
+// to https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed-size blocks and
+// each block in turn stores entries. When storing data, we can compress
+// and/or encode the data efficiently within a block, which often results in
+// a much smaller data size compared with the raw data size. As for record
+// retrieval, we first locate the block where the target record may reside,
+// then read the block into memory, and finally search for that record within
+// the block. To avoid frequent reads of the same block, we introduced the
+// block cache to keep loaded blocks in memory.
 class BlockBasedTable : public TableReader {
  public:
   static const std::string kFilterBlockPrefix;
@@ -425,7 +433,7 @@ class BlockBasedTable : public TableReader {
   friend class PartitionedFilterBlockTest;
 };
 
-// Maitaning state of a two-level iteration on a partitioned index structure
+// Maintaining state of a two-level iteration on a partitioned index structure.
 class BlockBasedTable::PartitionedIndexIteratorState
     : public TwoLevelIteratorState {
  public:
@@ -444,6 +452,8 @@ class BlockBasedTable::PartitionedIndexIteratorState
   bool index_key_is_full_;
 };
 
+// Stores all the properties associated with a BlockBasedTable.
+// These are immutable.
 struct BlockBasedTable::Rep {
   Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
       const BlockBasedTableOptions& _table_opt,
@@ -553,6 +563,7 @@ struct BlockBasedTable::Rep {
   }
 };
 
+// Iterates over the contents of BlockBasedTable.
 template <class TBlockIter, typename TValue = Slice>
 class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  public:
diff --git a/table/table_reader.h b/table/table_reader.h
index bd6071d9c67..037dbc33818 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -26,9 +26,11 @@ struct TableProperties;
 class GetContext;
 class MultiGetContext;
 
-// A Table is a sorted map from strings to strings.  Tables are
-// immutable and persistent.  A Table may be safely accessed from
-// multiple threads without external synchronization.
+// A Table (also referred to as SST) is a sorted map from strings to strings.
+// Tables are immutable and persistent. A Table may be safely accessed from
+// multiple threads without external synchronization. Table readers are used
+// for reading various types of table formats supported by rocksdb including
+// BlockBasedTable, PlainTable and CuckooTable format.
 class TableReader {
  public:
   virtual ~TableReader() {}

From 6267ed251ae5162b7b5c41521061e5541af301f5 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Fri, 24 May 2019 13:05:58 -0700
Subject: [PATCH 067/572] Improve comment in db_impl.h (#5338)

Summary:
Add some comments in db_impl.h. Also reordered the functions a little so
that I could add a comment to flag the area of functions implementing the
DB interface.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5338

Differential Revision: D15498284

Pulled By: siying

fbshipit-source-id: 3d7c59c8303577fe44d13c74ae84c7ce05164f77
---
 db/db_impl.h | 355 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 193 insertions(+), 162 deletions(-)

diff --git a/db/db_impl.h b/db/db_impl.h
index f574a8f4479..f2544e85941 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -75,16 +75,30 @@ struct JobContext;
 struct ExternalSstFileInfo;
 struct MemTableInfo;
 
+// DB is the public interface of RocksDB, and DBImpl is the class that
+// actually implements it. It is the entrance to the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc., wrap a
+// DBImpl internally.
+// Other than the functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it is a very large class, the definitions of its functions are
+// divided among several db_impl_*.cc files in addition to db_impl.cc.
class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, const bool seq_per_batch = false, const bool batch_per_txn = true); virtual ~DBImpl(); + // ---- Implementations of the DB interface ---- + using DB::Resume; virtual Status Resume() override; - // Implementations of the DB interface using DB::Put; virtual Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -110,13 +124,6 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - // Function that Get and KeyMayExist call with no_io true or false - // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); - using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -174,12 +181,6 @@ class DBImpl : public DB { const ReadOptions& options, const std::vector& column_families, std::vector* iterators) override; - ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, - ColumnFamilyData* cfd, - SequenceNumber snapshot, - ReadCallback* read_callback, - bool allow_blob = false, - bool allow_refresh = true); virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; @@ -259,23 +260,19 @@ class DBImpl : public DB { virtual Status UnlockWAL() override; virtual SequenceNumber GetLatestSequenceNumber() const override; - virtual SequenceNumber GetLastPublishedSequence() const { - if (last_seq_same_as_publish_seq_) { - return versions_->LastSequence(); - } else { - return versions_->LastPublishedSequence(); - } - } - // REQUIRES: joined the main write queue if two_write_queues is disabled, and - // the second write queue otherwise. - virtual void SetLastPublishedSequence(SequenceNumber seq); - // Returns LastSequence in last_seq_same_as_publish_seq_ - // mode and LastAllocatedSequence otherwise. This is useful when visiblility - // depends also on data written to the WAL but not to the memtable. 
- SequenceNumber TEST_GetLastVisibleSequence() const; virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + virtual Status GetDbIdentity(std::string& identity) const override; + + ColumnFamilyHandle* DefaultColumnFamily() const override; + + virtual Status Close() override; + + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) override; + #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -313,12 +310,76 @@ class DBImpl : public DB { Status PromoteL0(ColumnFamilyHandle* column_family, int target_level) override; + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) override; + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + + virtual Status VerifyChecksum() override; + + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + +#endif // ROCKSDB_LITE + + // ---- End of implementations of the DB interface ---- + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + bool* value_found = nullptr, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool allow_blob = false, + bool allow_refresh = true); + + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. + virtual void SetLastPublishedSequence(SequenceNumber seq); + // Returns LastSequence in last_seq_same_as_publish_seq_ + // mode and LastAllocatedSequence otherwise. This is useful when visiblility + // depends also on data written to the WAL but not to the memtable. + SequenceNumber TEST_GetLastVisibleSequence() const; + +#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback); + // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into the current // memtables. 
It can then be assumed that any write with a larger(or equal) @@ -360,25 +421,6 @@ class DBImpl : public DB { bool* found_record_for_key, bool* is_blob_index = nullptr); - using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override; - - using DB::IngestExternalFiles; - virtual Status IngestExternalFiles( - const std::vector& args) override; - - virtual Status VerifyChecksum() override; - - using DB::StartTrace; - virtual Status StartTrace( - const TraceOptions& options, - std::unique_ptr&& trace_writer) override; - - using DB::EndTrace; - virtual Status EndTrace() override; Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE @@ -393,8 +435,6 @@ class DBImpl : public DB { // match to our in-memory records virtual Status CheckConsistency(); - virtual Status GetDbIdentity(std::string& identity) const override; - // max_file_num_to_ignore allows bottom level compaction to filter out newly // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will // disable the filtering @@ -416,102 +456,6 @@ class DBImpl : public DB { return &logs_with_prep_tracker_; } -#ifndef NDEBUG - // Extra methods (for testing) that are not in the public DB interface - // Implemented in db_impl_debug.cc - - // Compact any files in the named level that overlap [*begin, *end] - Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, - ColumnFamilyHandle* column_family = nullptr, - bool disallow_trivial_move = false); - - void TEST_SwitchWAL(); - - bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } - - bool TEST_IsLogGettingFlushed() { - return alive_log_files_.begin()->getting_flushed; - } - - Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); - - // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, - ColumnFamilyHandle* cfh = nullptr); - - // Wait for memtable compaction - Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); - - // Wait for any compaction - // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this - // is only for the special test of CancelledCompactions - Status TEST_WaitForCompact(bool waitUnscheduled = false); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes( - ColumnFamilyHandle* column_family = nullptr); - - // Return the current manifest file no. - uint64_t TEST_Current_Manifest_FileNo(); - - // Returns the number that'll be assigned to the next file that's created. - uint64_t TEST_Current_Next_FileNo(); - - // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize(); - - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, - std::vector>* metadata); - - void TEST_LockMutex(); - - void TEST_UnlockMutex(); - - // REQUIRES: mutex locked - void* TEST_BeginWrite(); - - // REQUIRES: mutex locked - // pass the pointer that you got from TEST_BeginWrite() - void TEST_EndWrite(void* w); - - uint64_t TEST_MaxTotalInMemoryState() const { - return max_total_in_memory_state_; - } - - size_t TEST_LogsToFreeSize(); - - uint64_t TEST_LogfileNumber(); - - uint64_t TEST_total_log_size() const { return total_log_size_; } - - // Returns column family name to ImmutableCFOptions map. - Status TEST_GetAllImmutableCFOptions( - std::unordered_map* iopts_map); - - // Return the lastest MutableCFOptions of a column family - Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, - MutableCFOptions* mutable_cf_options); - - Cache* TEST_table_cache() { return table_cache_.get(); } - - WriteController& TEST_write_controler() { return write_controller_; } - - uint64_t TEST_FindMinLogContainingOutstandingPrep(); - uint64_t TEST_FindMinPrepLogReferencedByMemTable(); - size_t TEST_PreparedSectionCompletedSize(); - size_t TEST_LogsWithPrepSize(); - - int TEST_BGCompactionsAllowed() const; - int TEST_BGFlushesAllowed() const; - size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForDumpStatsRun(std::function callback) const; - void TEST_WaitForPersistStatsRun(std::function callback) const; - bool TEST_IsPersistentStatsEnabled() const; - size_t TEST_EstiamteStatsHistorySize() const; - -#endif // NDEBUG - struct BGJobLimits { int max_flushes; int max_compactions; @@ -555,12 +499,15 @@ class DBImpl : public DB { void PurgeObsoleteFiles(JobContext& background_contet, bool schedule_only = false); + // Schedule a background job to actually delete obsolete files. void SchedulePurge(); - ColumnFamilyHandle* DefaultColumnFamily() const override; - const SnapshotList& snapshots() const { return snapshots_; } + // load list of snapshots to `snap_vector` that is no newer than `max_seq` + // in ascending order. + // `oldest_write_conflict_snapshot` is filled with the oldest snapshot + // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true. void LoadSnapshots(std::vector* snap_vector, SequenceNumber* oldest_write_conflict_snapshot, const SequenceNumber& max_seq) const { @@ -572,6 +519,10 @@ class DBImpl : public DB { return immutable_db_options_; } + // Cancel all background jobs, including flush, compaction, background + // purging, stats dumping threads, etc. If `wait` = true, wait for the + // running jobs to abort or finish before returning. Otherwise, only + // sends the signals. void CancelAllBackgroundWork(bool wait); // Find Super version and reference it. Based on options, it might return @@ -748,6 +699,8 @@ class DBImpl : public DB { InstrumentedMutex* mutex() const { return &mutex_; } + // Initialize a brand new DB. The DB directory is expected to be empty before + // calling it. Status NewDB(); // This is to be used only by internal rocksdb classes. 
@@ -756,21 +709,109 @@ class DBImpl : public DB { std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); - virtual Status Close() override; static Status CreateAndNewDirectory(Env* env, const std::string& dirname, std::unique_ptr* directory); - Status GetStatsHistory( - uint64_t start_time, uint64_t end_time, - std::unique_ptr* stats_iterator) override; - // find stats map from stats_history_ with smallest timestamp in // the range of [start_time, end_time) bool FindStatsByTime(uint64_t start_time, uint64_t end_time, uint64_t* new_time, std::map* stats_map); +#ifndef NDEBUG + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr, + bool disallow_trivial_move = false); + + void TEST_SwitchWAL(); + + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } + + bool TEST_IsLogGettingFlushed() { + return alive_log_files_.begin()->getting_flushed; + } + + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, + ColumnFamilyHandle* cfh = nullptr); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + + // get total level0 file size. Only for testing. + uint64_t TEST_GetLevel0TotalSize(); + + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector>* metadata); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); + + uint64_t TEST_MaxTotalInMemoryState() const { + return max_total_in_memory_state_; + } + + size_t TEST_LogsToFreeSize(); + + uint64_t TEST_LogfileNumber(); + + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. 
+ Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_options); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); + + int TEST_BGCompactionsAllowed() const; + int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForDumpStatsRun(std::function callback) const; + void TEST_WaitForPersistStatsRun(std::function callback) const; + bool TEST_IsPersistentStatsEnabled() const; + size_t TEST_EstiamteStatsHistorySize() const; + +#endif // NDEBUG + protected: Env* const env_; const std::string dbname_; @@ -1700,16 +1741,6 @@ class DBImpl : public DB { ColumnFamilyData* cfd, SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options); -#ifndef ROCKSDB_LITE - using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override; - virtual Status GetPropertiesOfTablesInRange( - ColumnFamilyHandle* column_family, const Range* range, std::size_t n, - TablePropertiesCollection* props) override; - -#endif // ROCKSDB_LITE bool GetIntPropertyInternal(ColumnFamilyData* cfd, const DBPropertyInfo& property_info, From eb7647ee6ce96fdeb3f49a341463efab50cc7658 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 24 May 2019 13:24:52 -0700 Subject: [PATCH 068/572] Add comments t get_context.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5353 Differential Revision: D15497912 Pulled By: anand1976 fbshipit-source-id: 72cff2465ca342aa810f925be5a7016b938aa416 --- table/get_context.h | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/table/get_context.h b/table/get_context.h index 7ed316f0e1a..856e01a9502 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -17,6 +17,9 @@ namespace rocksdb { class MergeContext; class PinnedIteratorsManager; +// Data structure for accumulating statistics during a point lookup. At the +// end of the point lookup, the corresponding ticker stats are updated. This +// avoids the overhead of frequent ticker stats updates struct GetContextStats { uint64_t num_cache_hit = 0; uint64_t num_cache_index_hit = 0; @@ -41,8 +44,17 @@ struct GetContextStats { uint64_t num_cache_compression_dict_bytes_insert = 0; }; +// A class to hold context about a point lookup, such as pointer to value +// slice, key, merge context etc, as well as the current state of the +// lookup. Any user using GetContext to track the lookup result must call +// SaveValue() whenever the internal key is found. This can happen +// repeatedly in case of merge operands. In case the key may exist with +// high probability, but IO is required to confirm and the user doesn't allow +// it, MarkKeyMayExist() must be called instead of SaveValue(). class GetContext { public: + // Current state of the point lookup. 
All except kNotFound and kMerge are + // terminal states enum GetState { kNotFound, kFound, @@ -53,6 +65,19 @@ class GetContext { }; GetContextStats get_context_stats_; + // Constructor + // @param value_found If non-nullptr, set to false if key may be present + // but we can't be certain because we cannot do IO + // @param max_covering_tombstone_seq Pointer to highest sequence number of + // range deletion covering the key. When an internal key + // is found with smaller sequence number, the lookup + // terminates + // @param seq If non-nullptr, the sequence number of the found key will be + // saved here + // @param callback Pointer to ReadCallback to perform additional checks + // for visibility of a key + // @param is_blob_index If non-nullptr, will be used to indicate if a found + // key is of type blob index GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, @@ -64,13 +89,15 @@ class GetContext { GetContext() = default; + // This can be called to indicate that a key may be present, but cannot be + // confirmed due to IO not allowed void MarkKeyMayExist(); // Records this key, value, and any meta-data (such as sequence number and // state) into this GetContext. // // If the parsed_key matches the user key that we are looking for, sets - // mathced to true. + // matched to true. // // Returns True if more keys need to be read (due to merges) or // False if the complete value has been found. @@ -133,6 +160,9 @@ class GetContext { bool* is_blob_index_; }; +// Call this to replay a log and bring the get_context up to date. The replay +// log must have been created by another GetContext object, whose replay log +// must have been set by calling GetContext::SetReplayLog(). void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner = nullptr); From 029b98984e2f6babc2526362ddfffeea0798d625 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 24 May 2019 14:22:42 -0700 Subject: [PATCH 069/572] Add some comments in table_cache.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5343 Differential Revision: D15485831 Pulled By: anand1976 fbshipit-source-id: 8735ccfba90d7ecb3559e63f792e34527f04ed29 --- db/table_cache.h | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/db/table_cache.h b/db/table_cache.h index 1e96dfa1bd5..64d7b898b22 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -32,6 +32,19 @@ struct FileDescriptor; class GetContext; class HistogramImpl; +// Manages caching for TableReader objects for a column family. The actual +// cache is allocated separately and passed to the constructor. TableCache +// wraps around the underlying SST file readers by providing Get(), +// MultiGet() and NewIterator() methods that hide the instantiation, +// caching and access to the TableReader. The main purpose of this is +// performance - by caching the TableReader, it avoids unnecessary file opens +// and object allocation and instantiation. One exception is compaction, where +// a new TableReader may be instantiated - see NewIterator() comments +// +// Another service provided by TableCache is managing the row cache - if the +// DB is configured with a row cache, and the lookup key is present in the row +// cache, lookup is very fast. 
The row cache is obtained from +// ioptions.row_cache class TableCache { public: TableCache(const ImmutableCFOptions& ioptions, @@ -39,14 +52,16 @@ class TableCache { ~TableCache(); // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-nullptr, also sets "*tableptr" to point to the Table object + // file length must be exactly "file_size" bytes). If "table_reader_ptr" + // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object // underlying the returned iterator, or nullptr if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the + // the returned iterator. The returned "*table_reader_ptr" object is owned + // by the cache and should not be deleted, and is valid for as long as the // returned iterator is live. // @param range_del_agg If non-nullptr, adds range deletions to the // aggregator. If an error occurs, returns it in a NewErrorInternalIterator + // @param for_compaction If true, a new TableReader may be allocated (but + // not cached), depending on the CF options // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" InternalIterator* NewIterator( @@ -61,11 +76,13 @@ class TableCache { const InternalKey* largest_compaction_key = nullptr); // If a seek to internal key "k" in specified file finds an entry, - // call (*handle_result)(arg, found_key, found_value) repeatedly until - // it returns false. - // @param get_context State for get operation. If its range_del_agg() returns - // non-nullptr, adds range deletions to the aggregator. If an error occurs, - // returns non-ok status. + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param get_context Context for get operation. The result of the lookup + // can be retrieved by calling get_context->State() + // @param file_read_hist If non-nullptr, the file reader statistics are + // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, @@ -76,6 +93,15 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1); + // If a seek to internal key "k" in specified file finds an entry, + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param mget_range Pointer to the structure describing a batch of keys to + // be looked up in this table file. 
The result is stored + // in the embedded GetContext + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" Status MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta,
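As an aside to the row-cache comment above: the cache that TableCache consults is supplied by the user through the public API. A minimal sketch of enabling it (the path and capacity are arbitrary placeholders, not part of the patch):

#include "rocksdb/cache.h"
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Row cache consulted by TableCache on point lookups; a hit returns
  // without touching the table reader at all.
  options.row_cache = rocksdb::NewLRUCache(64 << 20);  // 64 MB, arbitrary

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/row_cache_demo", &db);
  if (s.ok()) {
    delete db;
  }
  return 0;
}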
From a466120cd50a87caf786311beca5684b8dc40eae Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 24 May 2019 15:26:02 -0700 Subject: [PATCH 070/572] improve comments in db_impl_secondary Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5360 Differential Revision: D15502973 Pulled By: miasantreble fbshipit-source-id: 15b7f9d7928e771a6fac0643861173be8ba6b37a --- db/db_impl_secondary.cc | 2 -- db/db_impl_secondary.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 5dfa2d0c942..586158ef7ce 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -120,8 +120,6 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) { return s; } -// try to find log reader using log_number from log_readers_ map, initialize -// if it doesn't exist Status DBImplSecondary::MaybeInitLogReader( uint64_t log_number, log::FragmentBufferedReader** log_reader) { auto iter = log_readers_.find(log_number); diff --git a/db/db_impl_secondary.h b/db/db_impl_secondary.h index 912708b1ec0..a57835432dc 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl_secondary.h @@ -13,6 +13,7 @@ namespace rocksdb { +// A wrapper class to hold a log reader, along with its reporter and status. class LogReaderContainer { public: LogReaderContainer() @@ -62,11 +63,19 @@ class LogReaderContainer { }; }; +// The secondary instance shares access to the same storage as the primary. +// The secondary is able to read and replay changes described in both the +// MANIFEST and the WAL files without coordination with the primary. +// The secondary instance can be opened using `DB::OpenAsSecondary`. After +// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make +// best-effort attempts to catch up with the primary. class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname); ~DBImplSecondary() override; + // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ + // and log_readers_ to facilitate future operations. Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, bool error_if_log_file_exist, bool error_if_data_exists_in_logs) override; @@ -182,10 +191,15 @@ class DBImplSecondary : public DBImpl { // method can take long time due to all the I/O and CPU costs. Status TryCatchUpWithPrimary() override; + + // Try to find the log reader for log_number in the log_readers_ map; + // initialize one if it doesn't exist Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); protected: + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs class ColumnFamilyCollector : public WriteBatch::Handler { std::unordered_set<uint32_t> column_family_ids_; @@ -262,6 +276,8 @@ class DBImplSecondary : public DBImpl { std::unordered_set<ColumnFamilyData*>* cfds_changed, JobContext* job_context); Status FindNewLogNumbers(std::vector<uint64_t>* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence, std::unordered_set<ColumnFamilyData*>* cfds_changed,
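For context on the secondary mode documented above, a sketch of the corresponding public API flow; the paths are placeholders and error handling is reduced to asserts:

#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  // "/tmp/primary" is an existing DB dir; "/tmp/secondary" holds the
  // secondary's own info log and metadata. Both paths are placeholders.
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/tmp/primary", "/tmp/secondary", &db);
  assert(s.ok());

  // Best-effort replay of MANIFEST/WAL changes made by the primary since
  // the last catch-up.
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());
  delete db;
  return 0;
}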
From b09c018b4d42049de5a9275f2af3c0776b622655 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 24 May 2019 16:55:53 -0700 Subject: [PATCH 071/572] Add comments to trace_replay.h (#5359) Summary: Add file, class, and function level comments in trace_replay.h Pull Request resolved: https://github.com/facebook/rocksdb/pull/5359 Differential Revision: D15505318 Pulled By: sagar0 fbshipit-source-id: 181e3d4ea805fd9a33f91b89e123bbd0c1ead2ce --- util/trace_replay.h | 45 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/util/trace_replay.h b/util/trace_replay.h index 29c00c287b2..d4030c61518 100644 --- a/util/trace_replay.h +++ b/util/trace_replay.h @@ -15,6 +15,9 @@ namespace rocksdb { +// This file contains Tracer and Replayer classes that enable capturing and +// replaying RocksDB traces. + class ColumnFamilyHandle; class ColumnFamilyData; class DB; @@ -29,6 +32,7 @@ const unsigned int kTracePayloadLengthSize = 4; const unsigned int kTraceMetadataSize = kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize; +// Supported Trace types. enum TraceType : char { kTraceBegin = 1, kTraceEnd = 2, @@ -36,13 +40,16 @@ enum TraceType : char { kTraceGet = 4, kTraceIteratorSeek = 5, kTraceIteratorSeekForPrev = 6, + // All trace types should be added before kTraceMax kTraceMax, }; // TODO: This should also be made part of public interface to help users build // custom TracerReaders and TraceWriters. + // + // The data structure that defines a single trace. struct Trace { - uint64_t ts; + uint64_t ts; // timestamp TraceType type; std::string payload; @@ -53,25 +60,47 @@ struct Trace { } }; -// Trace RocksDB operations using a TraceWriter. +// Tracer captures all RocksDB operations using a user-provided TraceWriter. +// Every RocksDB operation is written as a single trace. Each trace will have a +// timestamp and type, followed by the trace payload. class Tracer { public: Tracer(Env* env, const TraceOptions& trace_options, std::unique_ptr<TraceWriter>&& trace_writer); ~Tracer(); + // Trace all write operations -- Put, Merge, Delete, SingleDelete, Write Status Write(WriteBatch* write_batch); + + // Trace Get operations. Status Get(ColumnFamilyHandle* cfname, const Slice& key); + + // Trace Iterators. Status IteratorSeek(const uint32_t& cf_id, const Slice& key); Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + + // Returns true if the trace is over the configured max trace file limit. + // False otherwise. bool IsTraceFileOverMax(); + // Writes a trace footer at the end of the tracing Status Close(); private: + // Write a trace header at the beginning, typically on initiating a trace, + // with some metadata like a magic number, trace version, RocksDB version, and + // trace format. Status WriteHeader(); + + // Write a trace footer, typically on ending a trace, with some metadata. Status WriteFooter(); + + // Write a single trace using the provided TraceWriter to the underlying + // system, say, a filesystem or a streaming service. Status WriteTrace(const Trace& trace); + + // Helps in filtering and sampling of traces. + // Returns true if a trace should be skipped, false otherwise. bool ShouldSkipTrace(const TraceType& type); Env* env_; @@ -80,14 +109,24 @@ class Tracer { uint64_t trace_request_count_; }; -// Replay RocksDB operations from a trace. +// Replayer helps to replay the captured RocksDB operations, using a +// user-provided TraceReader. +// The Replayer is instantiated via db_bench today, when using the "replay" +// benchmark. class Replayer { public: Replayer(DB* db, const std::vector<ColumnFamilyHandle*>& handles, std::unique_ptr<TraceReader>&& reader); ~Replayer(); + // Replay all the traces from the provided trace stream, taking the delay + // between the traces into consideration. Status Replay(); + + // Enables fast forwarding a replay by reducing the delay between the ingested + // traces. + // fast_forward : Rate of replay speedup. + // If 1, replay the operations at the same rate as in the trace stream. + // If > 1, speed up the replay by this amount. Status SetFastForward(uint32_t fast_forward); private:
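Tracer and Replayer are internal classes; the public entry points are DB::StartTrace/DB::EndTrace together with a TraceWriter. A hedged sketch of capturing a trace (the trace file path is a placeholder; error checks elided):

#include <memory>
#include <utility>
#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

void TraceSomeWork(rocksdb::DB* db, rocksdb::Env* env) {
  // A file-backed TraceWriter; other sinks can implement the interface.
  std::unique_ptr<rocksdb::TraceWriter> writer;
  rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(), "/tmp/db.trace",
                              &writer);

  rocksdb::TraceOptions trace_opts;  // defaults: no sampling, large size cap
  db->StartTrace(trace_opts, std::move(writer));

  db->Put(rocksdb::WriteOptions(), "key", "value");  // captured as a trace
  db->EndTrace();  // writes the trace footer
}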
From bd9f1d2d0ff7ea7beb289cb1ca230f1593ceedae Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 24 May 2019 18:35:11 -0700 Subject: [PATCH 072/572] Fix RocksDB auto-recovery from SpaceLimit err (#5334) Summary: If RocksDB is configured with a positive max_allowed_space (via sst file manager), then the sst file manager should use this value instead of total free disk space to determine whether to clear the background error of space limit reached. In DBSSTTest.DBWithMaxSpaceAllowed, we configure a low space limit that is very likely lower than the free disk space of the test machine. Therefore, once the test db encounters a Status::SpaceLimit, error handler will call into sst file manager to start error recovery which may clear the bg error since disk free space is larger than reserved_disk_buffer_. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5334 Differential Revision: D15501622 Pulled By: riversand963 fbshipit-source-id: 58035efc450b062d6b28c78c322005ec3705fb47 --- util/sst_file_manager_impl.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index 6a770b106e8..047b0c093d6 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -266,6 +266,9 @@ void SstFileManagerImpl::ClearError() { uint64_t free_space; Status s = env_->GetFreeSpace(path_, &free_space); + free_space = max_allowed_space_ > 0 + ? std::min(max_allowed_space_, free_space) + : free_space; if (s.ok()) { // In case of multi-DB instances, some of them may have experienced a // soft error and some a hard error. In the SstFileManagerImpl, a hard
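The max_allowed_space in question is configured through the public SstFileManager interface; an illustrative sketch, with an arbitrary 64 GB cap and placeholder path:

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/sst_file_manager.h"

void OpenWithSpaceLimit(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;

  // SstFileManager tracks SST space; exceeding the cap surfaces
  // Status::SpaceLimit() as a background error, which the code above
  // may later clear once usage drops below the limit again.
  options.sst_file_manager.reset(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  options.sst_file_manager->SetMaxAllowedSpaceUsage(64ULL << 30);  // 64 GB

  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, path, &db);
  delete db;
}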
From e264eebcd7f5880093b42f13a44c7e67d1619969 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 24 May 2019 20:28:52 -0700 Subject: [PATCH 073/572] Add comments in file_reader_writer.h (#5355) Summary: Add file and class level comments in file_reader_writer.h Pull Request resolved: https://github.com/facebook/rocksdb/pull/5355 Differential Revision: D15499020 Pulled By: sagar0 fbshipit-source-id: 925b2326885cdb4357e6a139ac65ee5e2ce1d613 --- util/file_reader_writer.h | 83 +++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 8 deletions(-) diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 4451f8b81bf..1ef23e8c936 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -6,6 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. + #pragma once #include #include @@ -22,9 +23,22 @@ namespace rocksdb { class Statistics; class HistogramImpl; +// This file provides the following main abstractions: +// SequentialFileReader : wrapper over Env::SequentialFile +// RandomAccessFileReader : wrapper over Env::RandomAccessFile +// WritableFileWriter : wrapper over Env::WritableFile +// In addition, it also exposes NewReadaheadRandomAccessFile, NewWritableFile, +// and ReadOneLine primitives. + +// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to +// always prefetch additional data with every read. This is mainly used in +// Compaction Table Readers. std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile( std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size); +// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles +// Buffered (i.e. when page cache is enabled) and Direct (with O_DIRECT / page +// cache disabled) reads appropriately, and also updates the IO stats. class SequentialFileReader { private: std::unique_ptr<SequentialFile> file_; @@ -61,6 +75,12 @@ class SequentialFileReader { bool use_direct_io() const { return file_->use_direct_io(); } }; +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is +// responsible for: +// - Handling Buffered and Direct reads appropriately. +// - Rate limiting compaction reads. +// - Notifying any interested listeners on the completion of a read. +// - Updating IO stats. class RandomAccessFileReader { private: #ifndef ROCKSDB_LITE @@ -151,7 +171,13 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } }; -// Use posix write to write data to a file. +// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides +// facilities to: +// - Handle Buffered and Direct writes. +// - Rate limit writes. +// - Flush and Sync the data to the underlying filesystem. +// - Notify any interested listeners on the completion of a write. +// - Update IO stats. class WritableFileWriter { private: #ifndef ROCKSDB_LITE @@ -277,13 +303,31 @@ class WritableFileWriter { Status SyncInternal(bool use_fsync); }; -// FilePrefetchBuffer can automatically do the readahead if file_reader, -// readahead_size, and max_readahead_size are passed in. -// max_readahead_size should be greater than or equal to readahead_size. -// readahead_size will be doubled on every IO, until max_readahead_size. +// FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: - // If `track_min_offset` is true, track minimum offset ever read. + // Constructor. + // + // All arguments are optional. + // file_reader : the file reader to use. Can be a nullptr. + // readahead_size : the initial readahead size. + // max_readahead_size : the maximum readahead size. + // If max_readahead_size > readahead_size, the readahead size will be + // doubled on every IO until max_readahead_size is hit. + // Typically this is set as a multiple of readahead_size. + // max_readahead_size should be greater than or equal to readahead_size. + // enable : controls whether reading from the buffer is enabled. + // If false, TryReadFromCache() always returns false, and we only take stats + // for the minimum offset if track_min_offset = true. + // track_min_offset : Track the minimum offset ever read and collect stats on + // it. Used for adaptable readahead of the file footer/metadata. + // + // Automatic readahead is enabled for a file if file_reader, readahead_size, + // and max_readahead_size are passed in. + // If file_reader is a nullptr, setting readadhead_size and max_readahead_size + // does not make any sense. So it does nothing. + // A user can construct a FilePrefetchBuffer without any arguments, but use + // `Prefetch` to load data into the buffer. FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, size_t readadhead_size = 0, size_t max_readahead_size = 0, bool enable = true, bool track_min_offset = false) @@ -294,11 +338,26 @@ class FilePrefetchBuffer { min_offset_read_(port::kMaxSizet), enable_(enable), track_min_offset_(track_min_offset) {} + + // Load data into the buffer from a file. + // reader : the file reader. + // offset : the file offset to start reading from. + // n : the number of bytes to read. Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n); + + // Tries returning the data for a file read from this buffer, if that data is + // in the buffer. + // It handles tracking the minimum read offset if track_min_offset = true. + // It also does the exponential readahead when readadhead_size is set as part + // of the constructor. + // + // offset : the file offset. + // n : the number of bytes. + // result : output buffer to put the data into. bool TryReadFromCache(uint64_t offset, size_t n, Slice* result); - // The minimum `offset` ever passed to TryReadFromCache(). Only be tracked - // if track_min_offset = true. + // The minimum `offset` ever passed to TryReadFromCache(). This will only be + // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } private: @@ -317,9 +376,17 @@ class FilePrefetchBuffer { bool track_min_offset_; }; +// Returns a WritableFile. +// +// env : the Env. +// fname : the file name. +// result : output arg. A WritableFile based on `fname` returned. +// options : the Env Options. extern Status NewWritableFile(Env* env, const std::string& fname, std::unique_ptr<WritableFile>* result, const EnvOptions& options); + +// Read a single line from a file. bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, std::string* output, bool* has_data, Status* result);
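FilePrefetchBuffer is an internal helper, not a public API, but the constructor contract documented above suggests a usage sketch like the following (the manual no-argument + Prefetch mode; reader creation is elided and assumed to happen elsewhere):

#include "util/file_reader_writer.h"  // internal header

// Load a region once, then serve reads from the buffer, as the footer
// reading code does. With the default constructor there is no automatic
// readahead; Prefetch() must populate the buffer explicitly.
void ReadRegionWithPrefetch(rocksdb::RandomAccessFileReader* reader,
                            uint64_t offset, size_t n) {
  rocksdb::FilePrefetchBuffer prefetch_buffer;
  if (prefetch_buffer.Prefetch(reader, offset, n).ok()) {
    rocksdb::Slice result;
    // Served from the buffer, since [offset, offset + n) was just loaded.
    prefetch_buffer.TryReadFromCache(offset, n, &result);
  }
}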
From b5e4ee2e763789e23ee2e31e8fc8f82916bafc2d Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 28 May 2019 12:16:22 -0700 Subject: [PATCH 074/572] Fix a clang analyze error (#5365) Summary: The analyzer thinks max_allowed_space_ can be 0. In that case, free_space will be assigned as free_space. It fails to realize that the function call GetFreeSpace actually sets the free_space variable properly, which is possibly due to lack of inter-function call analysis. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5365 Differential Revision: D15521790 Pulled By: riversand963 fbshipit-source-id: 839d0a285a1c8773a28a385f0c3be4bb7fbe32cb --- util/sst_file_manager_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index 047b0c093d6..d85b9c960de 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -264,7 +264,7 @@ void SstFileManagerImpl::ClearError() { return; } - uint64_t free_space; + uint64_t free_space = 0; Status s = env_->GetFreeSpace(path_, &free_space); free_space = max_allowed_space_ > 0 ? std::min(max_allowed_space_, free_space) From 4d0c3b1f9644ae5b6a13740075e259268eff40df Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 28 May 2019 12:18:31 -0700 Subject: [PATCH 075/572] Add comments in compaction_picker.h Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5357 Differential Revision: D15522825 Pulled By: siying fbshipit-source-id: d775386b9d10c7179f5d3af2c821ed213abfacdf --- db/compaction_picker.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 250566b1065..05895a26753 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -24,11 +24,26 @@ namespace rocksdb { +// The file contains an abstract class CompactionPicker, and its two +// sub-classes LevelCompactionPicker and NullCompactionPicker, as +// well as some helper functions used by them. + class LogBuffer; class Compaction; class VersionStorageInfo; struct CompactionInputFiles; +// An abstract class to pick compactions from an existing LSM-tree. +// +// Each compaction style inherits the class and implements the +// interface to form automatic compactions. If NeedCompaction() is true, +// then call PickCompaction() to find what files need to be compacted +// and where to put the output files. +// +// Non-virtual functions CompactRange() and CompactFiles() are used to +// pick files to compact based on users' DB::CompactRange() and +// DB::CompactFiles() requests, respectively. There is little +// compaction style specific logic for them. class CompactionPicker { public: CompactionPicker(const ImmutableCFOptions& ioptions, @@ -221,6 +236,9 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; +// Picks compactions for leveled compaction. See the wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for a description of leveled compaction. class LevelCompactionPicker : public CompactionPicker { public: LevelCompactionPicker(const ImmutableCFOptions& ioptions, @@ -236,6 +254,8 @@ class LevelCompactionPicker : public CompactionPicker { }; #ifndef ROCKSDB_LITE +// A dummy compaction picker that never triggers any automatic +// compaction. class NullCompactionPicker : public CompactionPicker { public: NullCompactionPicker(const ImmutableCFOptions& ioptions,
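The pickers above are selected indirectly through public compaction options; for instance, a sketch of opting into leveled compaction (the values are illustrative, not prescriptions):

#include "rocksdb/options.h"

rocksdb::Options MakeLeveledOptions() {
  rocksdb::Options options;
  // kCompactionStyleLevel routes picking to LevelCompactionPicker.
  options.compaction_style = rocksdb::kCompactionStyleLevel;
  options.level0_file_num_compaction_trigger = 4;  // L0->L1 trigger
  options.max_bytes_for_level_base = 256 << 20;    // target size of L1
  options.max_bytes_for_level_multiplier = 10;     // growth per level
  return options;
}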
From f5576c33173f3ef27fe9ba1d71beeb6f1aa15c6a Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Tue, 28 May 2019 14:18:24 -0700 Subject: [PATCH 076/572] WritePrepared: disableWAL in commit without prepare (#5327) Summary: When committing a transaction without prepare, WritePrepared simply writes the batch to the DB and adds the commit entry to CommitCache. When two_write_queues=true, following the rule of committing only from the 2nd write queue, the first write writes the batch, and the only thing the 2nd write does is write the commit entry to CommitCache. Currently the write batch in the 2nd write is set to an empty LogData entry, while the write to the WAL could simply be entirely disabled. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5327 Differential Revision: D15424546 Pulled By: maysamyabandeh fbshipit-source-id: 3d9ea3922d5196984c584d62a3ed57e1f7ca7b9f --- .../transactions/pessimistic_transaction_db.cc | 2 +- utilities/transactions/transaction_test.h | 5 +++++ .../write_prepared_transaction_test.cc | 7 ++++--- utilities/transactions/write_prepared_txn_db.cc | 14 +++----------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index c4e6e247756..7b1b0241c97 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -235,7 +235,7 @@ Status TransactionDB::Open( if (txn_db_options.write_policy == WRITE_PREPARED && db_options.unordered_write && !db_options.two_write_queues) { return Status::NotSupported( - "WRITE_UNPREPARED is incompatible with unordered_writes if " + "WRITE_PREPARED is incompatible with unordered_writes if " "two_write_queues is not enabled."); } diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 00fa6cf0364..2e3b9952709 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -214,6 +214,8 @@ class TransactionTestBase : public ::testing::Test { std::atomic<size_t> exp_seq = {0}; std::atomic<size_t> commit_writes = {0}; std::atomic<size_t> expected_commits = {0}; + // Without Prepare, the commit does not write to WAL + std::atomic<size_t> with_empty_commits = {0}; std::function<void(size_t, Status)> txn_t0_with_status = [&](size_t index, Status exp_s) { // Test DB's internal txn. It involves no prepare phase nor a commit marker. @@ -231,6 +233,7 @@ exp_seq++; } } + with_empty_commits++; }; std::function<void(size_t)> txn_t0 = [&](size_t index) { return txn_t0_with_status(index, Status::OK()); @@ -257,6 +260,7 @@ } } ASSERT_OK(s); + with_empty_commits++; }; std::function<void(size_t)> txn_t2 = [&](size_t index) { // Commit without prepare. It should write to DB without a commit marker. @@ -282,6 +286,7 @@ } } delete txn; + with_empty_commits++; }; std::function<void(size_t)> txn_t3 = [&](size_t index) { // A full 2pc txn that also involves a commit marker.
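To make the commit-without-prepare path exercised by these tests concrete, a hedged sketch against the public TransactionDB API (setup and error handling elided); note there is no Prepare() call, hence no commit marker:

#include <cassert>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

// Commit without prepare under WritePrepared: the batch is written
// directly and only a commit entry is added to the CommitCache.
void CommitWithoutPrepare(rocksdb::TransactionDB* txn_db) {
  rocksdb::WriteOptions write_options;
  rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);
  txn->Put("key", "value");
  rocksdb::Status s = txn->Commit();  // no Prepare() beforehand
  assert(s.ok());
  delete txn;
}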
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index b93f1a74ffe..7b5a585df91 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1396,6 +1396,7 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) { } DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); auto seq = db_impl->TEST_GetLastVisibleSequence(); + with_empty_commits = 0; exp_seq = seq; // This is increased before writing the batch for commit commit_writes = 0; @@ -1487,12 +1488,12 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) { assert(db != nullptr); db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); seq = db_impl->TEST_GetLastVisibleSequence(); - ASSERT_LE(exp_seq, seq); + ASSERT_LE(exp_seq, seq + with_empty_commits); // Check if flush preserves the last sequence number db_impl->Flush(fopt); seq = db_impl->GetLatestSequenceNumber(); - ASSERT_LE(exp_seq, seq); + ASSERT_LE(exp_seq, seq + with_empty_commits); // Check if recovery after flush preserves the last sequence number db_impl->FlushWAL(true); @@ -1500,7 +1501,7 @@ assert(db != nullptr); db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB()); seq = db_impl->GetLatestSequenceNumber(); - ASSERT_LE(exp_seq, seq); + ASSERT_LE(exp_seq, seq + with_empty_commits); } } diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 3b09cbbf7d6..6b6831fd83b 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -151,11 +151,6 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, bool do_one_write = !db_impl_->immutable_db_options().two_write_queues; WriteOptions write_options(write_options_orig); - bool sync = write_options.sync; - if (!do_one_write) { - // No need to sync on the first write - write_options.sync = false; - } // In the absence of Prepare markers, use Noop as a batch separator WriteBatchInternal::InsertNoop(batch); const bool DISABLE_MEMTABLE = true; @@ -192,8 +187,6 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, if (do_one_write) { return s; } // else do the 2nd write for commit - // Set the original value of sync - write_options.sync = sync; ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, "CommitBatchInternal 2nd write prepare_seq: %" PRIu64, prepare_seq); @@ -203,10 +196,9 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare( this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS); WriteBatch empty_batch; - empty_batch.PutLogData(Slice()); - const size_t ONE_BATCH = 1; - // In the absence of Prepare markers, use Noop as a batch separator - WriteBatchInternal::InsertNoop(&empty_batch); + write_options.disableWAL = true; + write_options.sync = false; + const size_t ONE_BATCH = 1; // Just to inc the seq s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr, no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); From eab4f49a2cba969af04a502e561653ca018dba97 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Tue, 28 May 2019 16:26:14 -0700 Subject: [PATCH 077/572] WritePrepared: skip_concurrency_control option (#5330) Summary: This enables the user to set
TransactionDBOptions::skip_concurrency_control so that the standard `DB::Write(const WriteOptions& opts, WriteBatch* updates)` skips concurrency control. This gives higher throughput to users who know their use case doesn't need concurrency control. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5330 Differential Revision: D15525932 Pulled By: maysamyabandeh fbshipit-source-id: 68421ac1ba34f549a4a8de9ce4c2dccf6fb4b06b --- include/rocksdb/utilities/transaction_db.h | 7 +++++ tools/db_bench_tool.cc | 10 +++++++ .../pessimistic_transaction_db.cc | 27 +++++++-------------- .../transactions/pessimistic_transaction_db.h | 22 +++++++++++++++ .../transactions/write_prepared_txn_db.cc | 14 +++++++++- .../transactions/write_prepared_txn_db.h | 3 +++ 6 files changed, 65 insertions(+), 18 deletions(-) diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 6c4346ff3e7..db32ba0bc3a 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -94,6 +94,13 @@ struct TransactionDBOptions { // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjunction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + private: // 128 entries size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
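A sketch of the configuration this option is meant for, mirroring the db_bench wiring in the next hunk; the path is a placeholder and error handling is elided:

#include <string>
#include "rocksdb/utilities/transaction_db.h"

// Use TransactionDB purely for write ordering, skipping per-key locking.
// Pairs unordered_write with WRITE_PREPARED + two_write_queues, as the
// option comment above describes.
rocksdb::TransactionDB* OpenOrderingOnlyTxnDB(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;
  options.two_write_queues = true;

  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::WRITE_PREPARED;
  txn_db_options.skip_concurrency_control = true;

  rocksdb::TransactionDB* txn_db = nullptr;
  rocksdb::TransactionDB::Open(options, txn_db_options, path, &txn_db);
  return txn_db;  // DB::Write() on this handle now skips row locks
}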
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 18d8733439b..2ceca4fd950 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -3788,6 +3788,11 @@ void VerifyDBFromDB(std::string& truth_db_name) { } else if (FLAGS_transaction_db) { TransactionDB* ptr; TransactionDBOptions txn_db_options; + if (options.unordered_write) { + options.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + txn_db_options.write_policy = WRITE_PREPARED; + } s = TransactionDB::Open(options, txn_db_options, db_name, column_families, &db->cfh, &ptr); if (s.ok()) { @@ -3814,6 +3819,11 @@ void VerifyDBFromDB(std::string& truth_db_name) { } else if (FLAGS_transaction_db) { TransactionDB* ptr = nullptr; TransactionDBOptions txn_db_options; + if (options.unordered_write) { + options.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + txn_db_options.write_policy = WRITE_PREPARED; + } s = CreateLoggerFromOptions(db_name, options, &options.info_log); if (s.ok()) { s = TransactionDB::Open(options, txn_db_options, db_name, &ptr); diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 7b1b0241c97..c1b37c148f5 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -522,23 +522,16 @@ Status PessimisticTransactionDB::Merge(const WriteOptions& options, Status PessimisticTransactionDB::Write(const WriteOptions& opts, WriteBatch* updates) { - // Need to lock all keys in this batch to prevent write conflicts with - // concurrent transactions. - Transaction* txn = BeginInternalTransaction(opts); - txn->DisableIndexing(); - - auto txn_impl = - static_cast_with_check<PessimisticTransaction, Transaction>(txn); - - // Since commitBatch sorts the keys before locking, concurrent Write() - // operations will not cause a deadlock. - // In order to avoid a deadlock with a concurrent Transaction, Transactions - // should use a lock timeout. - Status s = txn_impl->CommitBatch(updates); - - delete txn; + return WriteWithConcurrencyControl(opts, updates); +} +Status WriteCommittedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + if (txn_db_options_.skip_concurrency_control) { + return db_impl_->Write(opts, updates); + } else { + return WriteWithConcurrencyControl(opts, updates); + } } Status WriteCommittedTxnDB::Write( @@ -547,7 +540,7 @@ Status WriteCommittedTxnDB::Write( if (optimizations.skip_concurrency_control) { return db_impl_->Write(opts, updates); } else { - return Write(opts, updates); + return WriteWithConcurrencyControl(opts, updates); } } diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h index e80b28852e7..5242c6260b1 100644 --- a/utilities/transactions/pessimistic_transaction_db.h +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -19,6 +19,7 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_lock_mgr.h" #include "utilities/transactions/write_prepared_txn.h" @@ -67,6 +68,26 @@ class PessimisticTransactionDB : public TransactionDB { using TransactionDB::Write; virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + inline Status WriteWithConcurrencyControl(const WriteOptions& opts, + WriteBatch* updates) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. + Transaction* txn = BeginInternalTransaction(opts); + txn->DisableIndexing(); + + auto txn_impl = + static_cast_with_check<PessimisticTransaction, Transaction>(txn); + + // Since commitBatch sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock. + // In order to avoid a deadlock with a concurrent Transaction, Transactions + // should use a lock timeout.
+ Status s = txn_impl->CommitBatch(updates); + + delete txn; + + return s; + } using StackableDB::CreateColumnFamily; virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, @@ -191,6 +212,7 @@ class WriteCommittedTxnDB : public PessimisticTransactionDB { virtual Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) override; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; }; } // namespace rocksdb diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 6b6831fd83b..5250f3f2de5 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -108,6 +108,18 @@ Transaction* WritePreparedTxnDB::BeginTransaction( } } +Status WritePreparedTxnDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + if (txn_db_options_.skip_concurrency_control) { + // Skip locking the rows + const size_t UNKNOWN_BATCH_CNT = 0; + WritePreparedTxn* NO_TXN = nullptr; + return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN); + } else { + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); + } +} + Status WritePreparedTxnDB::Write( const WriteOptions& opts, const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) { @@ -123,7 +135,7 @@ Status WritePreparedTxnDB::Write( } else { // TODO(myabandeh): Make use of skip_duplicate_key_check hint // Fall back to unoptimized version - return PessimisticTransactionDB::Write(opts, updates); + return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates); } } diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 25b9b9a1b05..ffdf2f29d8f 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -72,6 +72,9 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const TransactionOptions& txn_options, Transaction* old_txn) override; + using TransactionDB::Write; + Status Write(const WriteOptions& opts, WriteBatch* updates) override; + // Optimized version of ::Write that receives more optimization requests such // as skip_concurrency_control. using PessimisticTransactionDB::Write; From 545d20604084993174f1c0680deeff33bc67a553 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Wed, 29 May 2019 20:44:08 -0700 Subject: [PATCH 078/572] Move some file related files outside util/ (#5375) Summary: util/ is meant for lower-level libraries, so it's a good idea to move the files that require knowledge of the DB out of it. Create a file/ directory and move some files there.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5375 Differential Revision: D15550935 Pulled By: siying fbshipit-source-id: 61a9715dcde5386eebfb43e93f847bba1ae0d3f2 --- CMakeLists.txt | 10 +++++----- Makefile | 2 +- TARGETS | 10 +++++----- db/builder.cc | 2 +- db/column_family.cc | 2 +- db/compaction_job.cc | 4 ++-- db/compaction_job_stats_test.cc | 2 +- db/compaction_picker.cc | 2 +- db/compaction_picker_universal.cc | 2 +- db/corruption_test.cc | 2 +- db/db_filesnapshot.cc | 4 ++-- db/db_impl.cc | 6 +++--- db/db_impl_compaction_flush.cc | 2 +- db/db_impl_files.cc | 4 ++-- db/db_impl_open.cc | 2 +- db/db_info_dumper.cc | 2 +- db/db_iter.cc | 2 +- db/db_sst_test.cc | 2 +- db/db_test.cc | 2 +- db/db_test_util.h | 2 +- db/deletefile_test.cc | 2 +- db/error_handler.cc | 2 +- db/external_sst_file_ingestion_job.cc | 2 +- db/external_sst_file_test.cc | 2 +- db/fault_injection_test.cc | 2 +- db/filename_test.cc | 2 +- db/flush_job.cc | 4 ++-- db/listener_test.cc | 2 +- db/memtable_list.h | 2 +- db/obsolete_files_test.cc | 2 +- db/plain_table_db_test.cc | 2 +- db/repair.cc | 2 +- db/repair_test.cc | 2 +- db/table_cache.cc | 2 +- db/transaction_log_impl.h | 2 +- db/version_set.cc | 2 +- db/wal_manager.cc | 4 ++-- {util => file}/delete_scheduler.cc | 4 ++-- {util => file}/delete_scheduler.h | 0 {util => file}/delete_scheduler_test.cc | 4 ++-- {util => file}/file_util.cc | 4 ++-- {util => file}/file_util.h | 2 +- {util => file}/filename.cc | 2 +- {util => file}/filename.h | 0 {util => file}/sst_file_manager_impl.cc | 2 +- {util => file}/sst_file_manager_impl.h | 2 +- src.mk | 8 ++++---- tools/ldb_cmd.cc | 2 +- tools/write_stress.cc | 2 +- util/auto_roll_logger.h | 2 +- util/fault_injection_test_env.h | 2 +- utilities/backupable/backupable_db.cc | 2 +- utilities/backupable/backupable_db_test.cc | 2 +- utilities/blob_db/blob_db_impl.cc | 6 +++--- utilities/blob_db/blob_db_impl_filesnapshot.cc | 2 +- utilities/blob_db/blob_db_test.cc | 4 ++-- utilities/blob_db/blob_file.cc | 2 +- utilities/checkpoint/checkpoint_impl.cc | 4 ++-- utilities/checkpoint/checkpoint_impl.h | 2 +- utilities/convenience/info_log_finder.cc | 2 +- utilities/options/options_util.cc | 2 +- utilities/ttl/db_ttl_impl.cc | 2 +- 62 files changed, 85 insertions(+), 85 deletions(-) rename {util => file}/delete_scheduler.cc (99%) rename {util => file}/delete_scheduler.h (100%) rename {util => file}/delete_scheduler_test.cc (99%) rename {util => file}/file_util.cc (97%) rename {util => file}/file_util.h (97%) rename {util => file}/filename.cc (99%) rename {util => file}/filename.h (100%) rename {util => file}/sst_file_manager_impl.cc (99%) rename {util => file}/sst_file_manager_impl.h (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bb0c089f2e..4d74152d9d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -538,6 +538,10 @@ set(SOURCES env/env_encryption.cc env/env_hdfs.cc env/mock_env.cc + file/delete_scheduler.cc + file/file_util.cc + file/filename.cc + file/sst_file_manager_impl.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -612,12 +616,9 @@ set(SOURCES util/concurrent_arena.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc - util/delete_scheduler.cc util/dynamic_bloom.cc util/event_logger.cc util/file_reader_writer.cc - util/file_util.cc - util/filename.cc util/filter_policy.cc util/hash.cc util/jemalloc_nodump_allocator.cc @@ -626,7 +627,6 @@ set(SOURCES util/random.cc util/rate_limiter.cc util/slice.cc - util/sst_file_manager_impl.cc util/status.cc 
util/string_util.cc util/sync_point.cc @@ -931,6 +931,7 @@ if(WITH_TESTS) env/env_basic_test.cc env/env_test.cc env/mock_env_test.cc + file/delete_scheduler_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc @@ -959,7 +960,6 @@ if(WITH_TESTS) util/bloom_test.cc util/coding_test.cc util/crc32c_test.cc - util/delete_scheduler_test.cc util/dynamic_bloom_test.cc util/event_logger_test.cc util/file_reader_writer_test.cc diff --git a/Makefile b/Makefile index ee20a41bb1a..ec0a04ed106 100644 --- a/Makefile +++ b/Makefile @@ -1369,7 +1369,7 @@ fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -delete_scheduler_test: util/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) +delete_scheduler_test: file/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index 073c977e5ad..7d271515728 100644 --- a/TARGETS +++ b/TARGETS @@ -143,6 +143,10 @@ cpp_library( "env/env_posix.cc", "env/io_posix.cc", "env/mock_env.cc", + "file/delete_scheduler.cc", + "file/file_util.cc", + "file/filename.cc", + "file/sst_file_manager_impl.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -218,12 +222,9 @@ cpp_library( "util/concurrent_arena.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", - "util/delete_scheduler.cc", "util/dynamic_bloom.cc", "util/event_logger.cc", "util/file_reader_writer.cc", - "util/file_util.cc", - "util/filename.cc", "util/filter_policy.cc", "util/hash.cc", "util/jemalloc_nodump_allocator.cc", @@ -232,7 +233,6 @@ cpp_library( "util/random.cc", "util/rate_limiter.cc", "util/slice.cc", - "util/sst_file_manager_impl.cc", "util/status.cc", "util/string_util.cc", "util/sync_point.cc", @@ -663,7 +663,7 @@ ROCKS_TESTS = [ ], [ "delete_scheduler_test", - "util/delete_scheduler_test.cc", + "file/delete_scheduler_test.cc", "serial", ], [ diff --git a/db/builder.cc b/db/builder.cc index 7f2fd72a191..b42ac187ef0 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -21,6 +21,7 @@ #include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/filename.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" @@ -32,7 +33,6 @@ #include "table/format.h" #include "table/internal_iterator.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/stop_watch.h" #include "util/sync_point.h" diff --git a/db/column_family.cc b/db/column_family.cc index 4592c945f2e..325610b8844 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -29,6 +29,7 @@ #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" +#include "file/sst_file_manager_impl.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" @@ -36,7 +37,6 @@ #include "table/merging_iterator.h" #include "util/autovector.h" #include "util/compression.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 44fb385d1b3..7d2015e5629 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -38,6 +38,8 @@ #include "db/merge_helper.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" +#include 
"file/filename.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -53,12 +55,10 @@ #include "table/table_builder.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 48e883bc6cc..5ca6bf4a337 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -27,6 +27,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -52,7 +53,6 @@ #include "table/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 4bd8ff0e33a..f500def41ee 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -20,8 +20,8 @@ #include #include #include "db/column_family.h" +#include "file/filename.h" #include "monitoring/statistics.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index 9291178585a..c25ae94fa1b 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -20,8 +20,8 @@ #include #include #include "db/column_family.h" +#include "file/filename.h" #include "monitoring/statistics.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 1ccb1aa2b09..ba97ca1502b 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -19,6 +19,7 @@ #include "db/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" @@ -26,7 +27,6 @@ #include "rocksdb/write_batch.h" #include "table/block_based_table_builder.h" #include "table/meta_blocks.h" -#include "util/filename.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ace0befb6d5..5b630e21635 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -17,11 +17,11 @@ #include "db/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/sync_point.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index 3ec9e2ab2d6..e7ed1866469 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -53,6 +53,9 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/sst_file_manager_impl.h" #include 
"memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/iostats_context_imp.h" @@ -89,12 +92,9 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 38c69dfc1e4..1e39bdd4271 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -16,12 +16,12 @@ #include "db/builder.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { diff --git a/db/db_impl_files.cc b/db/db_impl_files.cc index b16cf87947d..64c6dc96879 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl_files.cc @@ -16,8 +16,8 @@ #include #include "db/event_helpers.h" #include "db/memtable_list.h" -#include "util/file_util.h" -#include "util/sst_file_manager_impl.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 66104d0ba28..4240b2012dc 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -15,11 +15,11 @@ #include "db/builder.h" #include "db/error_handler.h" +#include "file/sst_file_manager_impl.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based_table_factory.h" #include "util/rate_limiter.h" -#include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 31050d20a29..be85357c2e1 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -15,8 +15,8 @@ #include #include +#include "file/filename.h" #include "rocksdb/env.h" -#include "util/filename.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index a606e3acd66..8fc17e1446e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -16,6 +16,7 @@ #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "file/filename.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -24,7 +25,6 @@ #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "util/arena.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 9003ed6b1ac..815aed23e0e 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -8,10 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_manager.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 8a112e48fcd..7864a7e2c65 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -31,6 +31,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/thread_status_util.h" #include "port/port.h" @@ -59,7 +60,6 @@ #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" diff --git a/db/db_test_util.h b/db/db_test_util.h index f5d7fd1a75f..81186bfb9ad 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -27,6 +27,7 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -45,7 +46,6 @@ #include "table/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/filename.h" #include "util/mock_time_env.h" #include "util/mutexlock.h" diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 54bab847927..81ff8d0b99f 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -16,10 +16,10 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/db/error_handler.cc b/db/error_handler.cc index afec14edcbe..140fb4850f6 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -6,7 +6,7 @@ #include "db/error_handler.h" #include "db/db_impl.h" #include "db/event_helpers.h" -#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 588ac5110a2..7bfc64f77cb 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -17,12 +17,12 @@ #include #include "db/version_edit.h" +#include "file/file_util.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" #include "util/stop_watch.h" #include "util/sync_point.h" diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 3850a2a031e..0a0994f0ea9 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -7,11 +7,11 @@ #include #include "db/db_test_util.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" #include "util/fault_injection_test_env.h" -#include "util/filename.h" #include "util/testutil.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 53de312c017..1bfaa299456 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -15,13 +15,13 @@ #include "db/log_format.h" #include "db/version_set.h" 
#include "env/mock_env.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "util/fault_injection_test_env.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/sync_point.h" diff --git a/db/filename_test.cc b/db/filename_test.cc index d6bde52834e..869469f3f0c 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/filename.h" +#include "file/filename.h" #include "db/dbformat.h" #include "port/port.h" diff --git a/db/flush_job.cc b/db/flush_job.cc index 21c1ff3a746..46915ca13a8 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -29,6 +29,8 @@ #include "db/merge_context.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -45,8 +47,6 @@ #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/event_logger.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 56968d8f803..6b716a1d4b1 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -8,6 +8,7 @@ #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -23,7 +24,6 @@ #include "rocksdb/table_properties.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" -#include "util/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/memtable_list.h b/db/memtable_list.h index 5df35660a4d..a5f0c123292 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -16,13 +16,13 @@ #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" +#include "file/filename.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "util/autovector.h" -#include "util/filename.h" #include "util/log_buffer.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 52175a07b74..6bf2acf8519 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -16,10 +16,10 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 8a08cf9fede..ef770c2e50b 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -15,6 +15,7 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -28,7 +29,6 @@ #include 
"table/plain_table_key_coding.h" #include "table/plain_table_reader.h" #include "table/table_builder.h" -#include "util/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/repair.cc b/db/repair.cc index 2715adcf129..577c122bcf9 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -74,6 +74,7 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -82,7 +83,6 @@ #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/repair_test.cc b/db/repair_test.cc index 3422532da4b..1851cde0dfc 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -11,10 +11,10 @@ #include "db/db_impl.h" #include "db/db_test_util.h" +#include "file/file_util.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/transaction_log.h" -#include "util/file_util.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/table_cache.cc b/db/table_cache.cc index 06255d6a354..01724dfc5cb 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -12,7 +12,7 @@ #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" -#include "util/filename.h" +#include "file/filename.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/statistics.h" diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6382b61a5b7..68ba620714c 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -9,13 +9,13 @@ #include "db/log_reader.h" #include "db/version_set.h" +#include "file/filename.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/transaction_log.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { diff --git a/db/version_set.cc b/db/version_set.cc index 5723c6d9253..c10eb9f7ac3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -33,6 +33,7 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "file/filename.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" @@ -49,7 +50,6 @@ #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 62511819e4d..cce714750e7 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -22,6 +22,8 @@ #include "db/log_writer.h" #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" +#include "file/file_util.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -29,8 +31,6 @@ #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/util/delete_scheduler.cc b/file/delete_scheduler.cc similarity index 99% rename from util/delete_scheduler.cc rename to file/delete_scheduler.cc index f5ee2844896..41ec84376b6 100644 --- a/util/delete_scheduler.cc +++ 
b/file/delete_scheduler.cc @@ -5,16 +5,16 @@ #ifndef ROCKSDB_LITE -#include "util/delete_scheduler.h" +#include "file/delete_scheduler.h" #include #include +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { diff --git a/util/delete_scheduler.h b/file/delete_scheduler.h similarity index 100% rename from util/delete_scheduler.h rename to file/delete_scheduler.h diff --git a/util/delete_scheduler_test.cc b/file/delete_scheduler_test.cc similarity index 99% rename from util/delete_scheduler_test.cc rename to file/delete_scheduler_test.cc index 0d8e354b9c0..c8544004cd5 100644 --- a/util/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -12,10 +12,10 @@ #include #include +#include "file/delete_scheduler.h" +#include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/delete_scheduler.h" -#include "util/sst_file_manager_impl.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/util/file_util.cc b/file/file_util.cc similarity index 97% rename from util/file_util.cc rename to file/file_util.cc index ba1b4744bbb..0364f834022 100644 --- a/util/file_util.cc +++ b/file/file_util.cc @@ -3,13 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "util/file_util.h" +#include "file/file_util.h" #include #include +#include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" -#include "util/sst_file_manager_impl.h" #include "util/file_reader_writer.h" namespace rocksdb { diff --git a/util/file_util.h b/file/file_util.h similarity index 97% rename from util/file_util.h rename to file/file_util.h index c3b365c8bc3..9116c1fecfb 100644 --- a/util/file_util.h +++ b/file/file_util.h @@ -6,11 +6,11 @@ #pragma once #include +#include "file/filename.h" #include "options/db_options.h" #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { // use_fsync maps to options.use_fsync, which determines the way that diff --git a/util/filename.cc b/file/filename.cc similarity index 99% rename from util/filename.cc rename to file/filename.cc index 32289aecb4b..0a48dc78c36 100644 --- a/util/filename.cc +++ b/file/filename.cc @@ -10,7 +10,7 @@ #define __STDC_FORMAT_MACROS #endif -#include "util/filename.h" +#include "file/filename.h" #include #include diff --git a/util/filename.h b/file/filename.h similarity index 100% rename from util/filename.h rename to file/filename.h diff --git a/util/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc similarity index 99% rename from util/sst_file_manager_impl.cc rename to file/sst_file_manager_impl.cc index d85b9c960de..86bcb2d19ca 100644 --- a/util/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/util/sst_file_manager_impl.h b/file/sst_file_manager_impl.h similarity index 99% rename from util/sst_file_manager_impl.h rename to file/sst_file_manager_impl.h index 211b4fa7160..b506ece2796 100644 --- a/util/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -13,8 +13,8 @@ #include "db/compaction.h" #include "db/error_handler.h" +#include "file/delete_scheduler.h" #include "rocksdb/sst_file_manager.h" -#include "util/delete_scheduler.h" namespace rocksdb { diff --git a/src.mk b/src.mk index e3fe5632f87..2541b9fd12b 100644 --- a/src.mk +++ b/src.mk @@ -67,6 +67,10 @@ LIB_SOURCES = \ env/env_posix.cc \ env/io_posix.cc \ env/mock_env.cc \ + file/delete_scheduler.cc \ + file/file_util.cc \ + file/filename.cc \ + file/sst_file_manager_impl.cc \ memtable/alloc_tracker.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ @@ -139,12 +143,9 @@ LIB_SOURCES = \ util/concurrent_arena.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ - util/delete_scheduler.cc \ util/dynamic_bloom.cc \ util/event_logger.cc \ util/file_reader_writer.cc \ - util/file_util.cc \ - util/filename.cc \ util/filter_policy.cc \ util/hash.cc \ util/jemalloc_nodump_allocator.cc \ @@ -153,7 +154,6 @@ LIB_SOURCES = \ util/random.cc \ util/rate_limiter.cc \ util/slice.cc \ - util/sst_file_manager_impl.cc \ util/status.cc \ util/string_util.cc \ util/sync_point.cc \ diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index d05ae4a5810..10e9a495d23 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -16,6 +16,7 @@ #include "db/dbformat.h" #include "db/log_reader.h" #include "db/write_batch_internal.h" +#include "file/filename.h" #include "port/port_dirent.h" #include "rocksdb/cache.h" #include "rocksdb/table_properties.h" @@ -31,7 +32,6 @@ #include "tools/sst_dump_tool_imp.h" #include "util/cast_util.h" #include "util/coding.h" -#include "util/filename.h" #include "util/stderr_logger.h" #include "util/string_util.h" #include "utilities/ttl/db_ttl_impl.h" diff --git a/tools/write_stress.cc b/tools/write_stress.cc index ddb1d0aed03..8cde31e6b84 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -67,12 +67,12 @@ int main() { #include #include +#include "file/filename.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "util/filename.h" #include "util/gflags_compat.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 64fce4d63e7..24f4714b4fd 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -10,9 +10,9 @@ #include #include +#include "file/filename.h" #include "port/port.h" #include "port/util_logger.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/sync_point.h" diff --git a/util/fault_injection_test_env.h b/util/fault_injection_test_env.h index a39e5b71e9d..d962acfd585 100644 --- a/util/fault_injection_test_env.h +++ b/util/fault_injection_test_env.h @@ -19,9 +19,9 @@ #include "db/version_set.h" #include "env/mock_env.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/random.h" diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index b7c15c39150..149eb911f7f 100644 --- 
a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -10,6 +10,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/backupable_db.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" @@ -17,7 +18,6 @@ #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/logging.h" #include "util/string_util.h" #include "util/sync_point.h" diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 1548203dd0a..e4abd96e95f 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -14,6 +14,7 @@ #include "db/db_impl.h" #include "env/env_chroot.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/rate_limiter.h" @@ -22,7 +23,6 @@ #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/options_util.h" #include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stderr_logger.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 5dcddc214c8..9f3839370eb 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -12,6 +12,9 @@ #include "db/db_impl.h" #include "db/write_batch_internal.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/instrumented_mutex.h" #include "monitoring/statistics.h" #include "rocksdb/convenience.h" @@ -26,12 +29,9 @@ #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/sync_point.h" #include "util/timer_queue.h" diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc index 8effe88c0a6..16b9ff826e6 100644 --- a/utilities/blob_db/blob_db_impl_filesnapshot.cc +++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -7,7 +7,7 @@ #include "utilities/blob_db/blob_db_impl.h" -#include "util/filename.h" +#include "file/filename.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index afb953df9c5..e24ba1d983c 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -14,13 +14,13 @@ #include #include "db/db_test_util.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/utilities/debug.h" #include "util/cast_util.h" #include "util/fault_injection_test_env.h" -#include "util/file_util.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 3bcbd048734..e14307d44cd 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -19,7 +19,7 @@ #include "db/column_family.h" #include "db/db_impl.h" #include "db/dbformat.h" -#include "util/filename.h" +#include "file/filename.h" #include "util/logging.h" #include "utilities/blob_db/blob_db_impl.h" diff --git 
a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index 9863ac1d564..920f9bf535b 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -21,13 +21,13 @@
 #include

 #include "db/wal_manager.h"
+#include "file/file_util.h"
+#include "file/filename.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/transaction_log.h"
 #include "rocksdb/utilities/checkpoint.h"
-#include "util/file_util.h"
-#include "util/filename.h"
 #include "util/sync_point.h"

 namespace rocksdb {
diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h
index a85fde59b60..d26a9f66bfc 100644
--- a/utilities/checkpoint/checkpoint_impl.h
+++ b/utilities/checkpoint/checkpoint_impl.h
@@ -9,8 +9,8 @@
 #include "rocksdb/utilities/checkpoint.h"

 #include
+#include "file/filename.h"
 #include "rocksdb/db.h"
-#include "util/filename.h"

 namespace rocksdb {
diff --git a/utilities/convenience/info_log_finder.cc b/utilities/convenience/info_log_finder.cc
index 72c4a6275ae..3e599961630 100644
--- a/utilities/convenience/info_log_finder.cc
+++ b/utilities/convenience/info_log_finder.cc
@@ -8,8 +8,8 @@
 // found in the LICENSE file.

 #include "rocksdb/utilities/info_log_finder.h"
+#include "file/filename.h"
 #include "rocksdb/env.h"
-#include "util/filename.h"

 namespace rocksdb {
diff --git a/utilities/options/options_util.cc b/utilities/options/options_util.cc
index 3975eadd755..561e925ebbe 100644
--- a/utilities/options/options_util.cc
+++ b/utilities/options/options_util.cc
@@ -7,9 +7,9 @@

 #include "rocksdb/utilities/options_util.h"

+#include "file/filename.h"
 #include "options/options_parser.h"
 #include "rocksdb/options.h"
-#include "util/filename.h"

 namespace rocksdb {
 Status LoadOptionsFromFile(const std::string& file_name, Env* env,
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index 1952e6188d6..47049a13585 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -7,12 +7,12 @@
 #include "utilities/ttl/db_ttl_impl.h"

 #include "db/write_batch_internal.h"
+#include "file/filename.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "util/coding.h"
-#include "util/filename.h"

 namespace rocksdb {

From 87fe4bcab857c38a22ebecfb6e7d0e5a8d9a0864 Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Thu, 30 May 2019 10:43:34 -0700
Subject: [PATCH 079/572] Fix FIFO dynamic options sanitization (#5367)

Summary:
When dynamically setting options, we check the option type info and skip
options that are marked deprecated. However, this check is only done at the
top level, which results in bugs where SetOptions will corrupt option values
and cause unexpected system behavior iff a deprecated second-level option is
set dynamically. For example, the following call:
```
dbfull()->SetOptions(
    {{"compaction_options_fifo",
      "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}});
```
dates from before the 6.0 release, when `ttl` was part of
`compaction_options_fifo`. Now that `ttl` has moved out of
`compaction_options_fifo`, this call will incorrectly set
`compaction_options_fifo.max_table_files_size` to 731 (as
`max_table_files_size` is the first entry in the
`OptionsHelper::fifo_compaction_options_type_info` struct) and cause files
to get evicted much faster than expected.

This PR adds verification for second-level options such as
`compaction_options_fifo.ttl` or `compaction_options_fifo.max_table_files_size`
when they are set dynamically, and filters out those marked as deprecated.
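To make the failure mode and the fix concrete, here is a minimal, self-contained sketch of parsing a nested `{k=v;...}` option string while skipping deprecated sub-options. This is not RocksDB's actual parser: `FifoOptions`, `kTypeInfo`, and `ParseStructOption` are invented for illustration, and only the skip-on-deprecated idea mirrors the `ParseSingleStructOption` change in the diff below.
```
// Minimal sketch (not RocksDB's actual parser): parse a "{k=v;...}"
// struct-option string, skipping entries whose type info marks them
// deprecated. FifoOptions, kTypeInfo, and ParseStructOption are all
// invented for illustration.
#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

enum class Verification { kNormal, kDeprecated };

struct FifoOptions {
  uint64_t max_table_files_size = 0;
  bool allow_compaction = false;
};

// "ttl" is still accepted for backward compatibility, but marked
// deprecated so its value is never written into FifoOptions.
const std::map<std::string, Verification> kTypeInfo = {
    {"max_table_files_size", Verification::kNormal},
    {"allow_compaction", Verification::kNormal},
    {"ttl", Verification::kDeprecated},
};

bool ParseStructOption(const std::string& spec, FifoOptions* out) {
  // Strip the surrounding braces, then split on ';'.
  if (spec.size() < 2 || spec.front() != '{' || spec.back() != '}') {
    return false;
  }
  std::istringstream body(spec.substr(1, spec.size() - 2));
  std::string entry;
  while (std::getline(body, entry, ';')) {
    if (entry.empty()) continue;
    const size_t eq = entry.find('=');
    if (eq == std::string::npos) return false;
    const std::string key = entry.substr(0, eq);
    const std::string value = entry.substr(eq + 1);
    auto it = kTypeInfo.find(key);
    if (it == kTypeInfo.end()) return false;  // unknown sub-option
    if (it->second == Verification::kDeprecated) {
      continue;  // accept but ignore, instead of corrupting another field
    }
    if (key == "max_table_files_size") {
      out->max_table_files_size = std::stoull(value);
    } else if (key == "allow_compaction") {
      out->allow_compaction = (value == "true");
    }
  }
  return true;
}

int main() {
  FifoOptions opts;
  // The pre-6.0 style string from the summary: ttl is parsed and dropped,
  // and max_table_files_size keeps its intended value.
  const bool ok = ParseStructOption(
      "{allow_compaction=true;max_table_files_size=1024;ttl=731;}", &opts);
  std::cout << ok << " " << opts.max_table_files_size << " "
            << opts.allow_compaction << "\n";  // prints: 1 1024 1
  return 0;
}
```
If the `kDeprecated` branch were removed, the parser would have to either reject `ttl` or write its value into some field of the struct, which parallels how the real bug wrote 731 over `max_table_files_size`.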
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5367

Differential Revision: D15530998

Pulled By: miasantreble

fbshipit-source-id: 818258be5c3abe09cd82d62f3c083572d70fecdd
---
 db/db_options_test.cc     | 47 +++++++++++++++++++++++++++++++++++++++
 options/options_helper.cc |  5 +++++
 2 files changed, 52 insertions(+)

diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index a7ecf12744b..cb9a0e02e61 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -1007,6 +1007,53 @@ TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
   ASSERT_EQ(256, env_->compaction_readahead_size_);
   Close();
 }
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.create_if_missing = true;
+
+  ASSERT_OK(TryReopen(options));
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+    }
+    Flush();
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // In release 6.0, ttl was promoted from a secondary level option under
+  // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+  // We still need to handle old SetOptions calls but should ignore
+  // ttl under compaction_options_fifo.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+       {"ttl", "60"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+  // Put ttl as the first option inside compaction_options_fifo. That works as
+  // it doesn't overwrite any other option.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+       {"ttl", "191"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
 #endif  // ROCKSDB_LITE
 } // namespace rocksdb

diff --git a/options/options_helper.cc b/options/options_helper.cc
index dbee1636d9f..82e7a1fa13a 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -372,6 +372,11 @@ bool ParseSingleStructOption(
     return false;
   }
   const auto& opt_info = iter->second;
+  if (opt_info.verification == OptionVerificationType::kDeprecated) {
+    // Should also skip deprecated sub-options such as
+    // fifo_compaction_options_type_info.ttl
+    return true;
+  }
   return ParseOptionHelper(
       reinterpret_cast<char*>(options) + opt_info.mutable_offset,
       opt_info.type, value);

From a984040f0bf205cb102cfbc377f8c9e44aff0300 Mon Sep 17 00:00:00 2001
From: anand76
Date: Thu, 30 May 2019 11:08:35 -0700
Subject: [PATCH 080/572] Increase Trash/DB size ratio in DBSSTTest.RateLimitedWALDelete (#5366)

Summary:
By increasing the ratio, we ensure that all files go through background
deletion and eliminate flakiness due to timing of deletions.
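The flakiness being fixed comes from a threshold effect: deletions are rate limited only while the accumulated trash stays under a cap proportional to the DB size, and anything over the cap is deleted immediately. Below is a minimal sketch of that gating idea; it is a simplification under assumed semantics, not `DeleteScheduler`'s actual code, and `ShouldUseTrash` and the numbers are illustrative.
```
// Illustrative sketch of the gating logic the test depends on (a
// simplification, not DeleteScheduler's real code): a deletion only goes
// through the rate-limited trash queue while total trash stays below
// max_trash_db_ratio * total_db_size; otherwise it falls back to an
// immediate, unthrottled delete.
#include <cstdint>
#include <iostream>

bool ShouldUseTrash(uint64_t total_trash_size, uint64_t total_db_size,
                    double max_trash_db_ratio) {
  return static_cast<double>(total_trash_size) <=
         max_trash_db_ratio * static_cast<double>(total_db_size);
}

int main() {
  const uint64_t db_size = 100;  // arbitrary units
  // With ratio 2.1, trash of 250 units is deleted immediately and the
  // test could miss it; with ratio 3.1 it still goes through trash.
  std::cout << ShouldUseTrash(250, db_size, 2.1) << "\n";  // prints: 0
  std::cout << ShouldUseTrash(250, db_size, 3.1) << "\n";  // prints: 1
  return 0;
}
```
Raising the ratio from 2.1 to 3.1 keeps every WAL deletion under the cap, so all of them take the slow, observable background-deletion path.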
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5366

Differential Revision: D15549992

Pulled By: anand1976

fbshipit-source-id: d137375cd791fc1a802841412755d6e2b8fd7688
---
 db/db_sst_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index 815aed23e0e..799d0e14f6b 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -430,6 +430,7 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) {
   env_->time_elapse_only_sleep_ = true;
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
   options.env = env_;

   int64_t rate_bytes_per_sec = 1024 * 10;  // 10 Kbs / Sec
@@ -439,7 +440,7 @@
   ASSERT_OK(s);
   options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
   auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
-  sfm->delete_scheduler()->SetMaxTrashDBRatio(2.1);
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);

   ASSERT_OK(TryReopen(options));
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();

From e9e0101ca46f00e8a456e69912a913d907be56fc Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Thu, 30 May 2019 11:21:38 -0700
Subject: [PATCH 081/572] Move test related files under util/ to test_util/ (#5377)

Summary:
There are too many types of files under util/. Some test related files don't
belong there or are only loosely related. Move them to a new directory,
test_util/, so that util/ is cleaner.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5377

Differential Revision: D15551366

Pulled By: siying

fbshipit-source-id: 0f5c8653832354ef8caa31749c0143815d719e2c
---
 CMakeLists.txt | 12 ++++++------
 Makefile | 4 ++--
 TARGETS | 14 +++++++-------
 buckifier/buckify_rocksdb.py | 2 +-
 cache/cache_test.cc | 2 +-
 cache/lru_cache_test.cc | 2 +-
 db/builder.cc | 2 +-
 db/column_family_test.cc | 8 ++++----
 db/compact_files_test.cc | 4 ++--
 db/compaction.cc | 2 +-
 db/compaction_iterator.cc | 2 +-
 db/compaction_iterator_test.cc | 4 ++--
 db/compaction_job.cc | 2 +-
 db/compaction_job_stats_test.cc | 6 +++---
 db/compaction_job_test.cc | 4 ++--
 db/compaction_picker.cc | 2 +-
 db/compaction_picker_test.cc | 4 ++--
 db/compaction_picker_universal.cc | 2 +-
 db/comparator_db_test.cc | 4 ++--
 db/corruption_test.cc | 4 ++--
 db/cuckoo_table_db_test.cc | 4 ++--
 db/db_basic_test.cc | 4 ++--
 db/db_compaction_test.cc | 4 ++--
 db/db_encryption_test.cc | 2 +-
 db/db_filesnapshot.cc | 2 +-
 db/db_flush_test.cc | 4 ++--
 db/db_impl.cc | 2 +-
 db/db_impl_compaction_flush.cc | 2 +-
 db/db_impl_open.cc | 2 +-
 db/db_impl_write.cc | 2 +-
 db/db_iter_stress_test.cc | 2 +-
 db/db_iter_test.cc | 4 ++--
 db/db_options_test.cc | 4 ++--
 db/db_range_del_test.cc | 2 +-
 db/db_secondary_test.cc | 4 ++--
 db/db_table_properties_test.cc | 4 ++--
 db/db_test.cc | 6 +++---
 db/db_test_util.h | 8 ++++----
 db/db_universal_compaction_test.cc | 2 +-
 db/db_wal_test.cc | 4 ++--
 db/db_write_test.cc | 4 ++--
 db/dbformat_test.cc | 2 +-
 db/deletefile_test.cc | 6 +++---
 db/error_handler_test.cc | 4 ++--
 db/external_sst_file_basic_test.cc | 2 +-
 db/external_sst_file_ingestion_job.cc | 2 +-
 db/external_sst_file_test.cc | 4 ++--
 db/fault_injection_test.cc | 8 ++++----
 db/file_indexer_test.cc | 4 ++--
 db/filename_test.cc | 2 +-
 db/flush_job.cc | 2 +-
 db/flush_job_test.cc | 4 ++--
 db/forward_iterator.cc | 2 +-
 db/forward_iterator_bench.cc | 2 +-
 db/listener_test.cc | 6 +++---
 db/log_test.cc | 4 ++--
 db/manual_compaction_test.cc | 2 +-
 db/memtable_list.cc | 2 +-
 db/memtable_list_test.cc | 4 ++--
 db/merge_helper_test.cc | 4 ++--
 db/merge_test.cc | 2 +-
 db/obsolete_files_test.cc | 6 +++---
 db/options_file_test.cc | 2 +-
 db/perf_context_test.cc | 2 +-
 db/plain_table_db_test.cc | 4 ++--
 db/prefix_test.cc | 2 +-
 db/range_del_aggregator_bench.cc | 2 +-
 db/range_del_aggregator_test.cc | 2 +-
 db/range_tombstone_fragmenter_test.cc | 2 +-
 db/table_cache.cc | 2 +-
 db/table_properties_collector_test.cc | 4 ++--
 db/version_builder_test.cc | 4 ++--
 db/version_edit.cc | 2 +-
 db/version_edit_test.cc | 4 ++--
 db/version_set.cc | 2 +-
 db/version_set_test.cc | 4 ++--
 db/wal_manager.cc | 2 +-
 db/wal_manager_test.cc | 4 ++--
 db/write_batch_test.cc | 2 +-
 db/write_callback_test.cc | 4 ++--
 db/write_controller_test.cc | 2 +-
 db/write_thread.cc | 2 +-
 env/env_basic_test.cc | 2 +-
 env/env_posix.cc | 2 +-
 env/env_test.cc | 6 +++---
 env/io_posix.cc | 2 +-
 env/mock_env_test.cc | 2 +-
 env/posix_logger.h | 2 +-
 file/delete_scheduler.cc | 2 +-
 file/delete_scheduler_test.cc | 6 +++---
 file/filename.cc | 2 +-
 file/sst_file_manager_impl.cc | 2 +-
 java/rocksjni/write_batch_test.cc | 2 +-
 memtable/inlineskiplist_test.cc | 2 +-
 memtable/memtablerep_bench.cc | 2 +-
 memtable/skiplist_test.cc | 2 +-
 memtable/write_buffer_manager_test.cc | 2 +-
 monitoring/histogram_test.cc | 2 +-
 monitoring/instrumented_mutex.cc | 2 +-
 monitoring/iostats_context_test.cc | 2 +-
 monitoring/statistics_test.cc | 4 ++--
 options/options_parser.cc | 2 +-
 options/options_settable_test.cc | 2 +-
 options/options_test.cc | 4 ++--
 port/win/env_default.cc | 2 +-
 port/win/io_win.cc | 2 +-
 src.mk | 12 ++++++------
 table/block.h | 2 +-
 table/block_based_filter_block_test.cc | 4 ++--
 table/block_based_table_reader.cc | 2 +-
 table/block_test.cc | 4 ++--
 table/cleanable_test.cc | 4 ++--
 table/cuckoo_table_builder_test.cc | 4 ++--
 table/cuckoo_table_reader_test.cc | 4 ++--
 table/data_block_hash_index_test.cc | 4 ++--
 table/full_filter_block_test.cc | 4 ++--
 table/merger_test.cc | 4 ++--
 table/merging_iterator.cc | 2 +-
 table/meta_blocks.cc | 2 +-
 table/mock_table.h | 4 ++--
 table/partitioned_filter_block_test.cc | 4 ++--
 table/sst_file_reader_test.cc | 4 ++--
 table/sst_file_writer.cc | 2 +-
 table/table_reader_bench.cc | 4 ++--
 table/table_test.cc | 6 +++---
 {util => test_util}/fault_injection_test_env.cc | 2 +-
 {util => test_util}/fault_injection_test_env.h | 0
 {util => test_util}/mock_time_env.h | 0
 {util => test_util}/sync_point.cc | 4 ++--
 {util => test_util}/sync_point.h | 0
 {util => test_util}/sync_point_impl.cc | 2 +-
 {util => test_util}/sync_point_impl.h | 2 +-
 {util => test_util}/testharness.cc | 2 +-
 {util => test_util}/testharness.h | 0
 {util => test_util}/testutil.cc | 2 +-
 {util => test_util}/testutil.h | 0
 {util => test_util}/transaction_test_util.cc | 2 +-
 {util => test_util}/transaction_test_util.h | 0
 tools/db_bench_tool.cc | 4 ++--
 tools/db_bench_tool_test.cc | 4 ++--
 tools/db_repl_stress.cc | 2 +-
 tools/db_stress.cc | 4 ++--
 tools/ldb_cmd_test.cc | 2 +-
 tools/reduce_levels_test.cc | 4 ++--
 tools/sst_dump_test.cc | 4 ++--
 tools/trace_analyzer_test.cc | 4 ++--
 util/arena.cc | 2 +-
 util/arena_test.cc | 2 +-
 util/auto_roll_logger.h | 2 +-
 util/auto_roll_logger_test.cc | 4 ++--
 util/autovector_test.cc | 4 ++--
 util/bloom_test.cc | 4 ++--
 util/coding_test.cc | 2 +-
 util/crc32c_test.cc | 2 +-
 util/dynamic_bloom_test.cc | 4 ++--
 util/event_logger_test.cc | 2 +-
 util/file_reader_writer.cc | 2 +-
 util/file_reader_writer.h | 2 +-
 util/file_reader_writer_test.cc | 4 ++--
 util/filelock_test.cc | 2 +-
 util/hash_test.cc | 2 +-
 util/log_write_bench.cc | 4 ++--
 util/rate_limiter.cc | 2 +-
 util/rate_limiter_test.cc | 4 ++--
 util/repeatable_thread.h | 2 +-
 util/repeatable_thread_test.cc | 4 ++--
 util/slice_transform_test.cc | 2 +-
 util/thread_list_test.cc | 2 +-
 util/thread_local_test.cc | 6 +++---
 util/timer_queue.h | 2 +-
 utilities/backupable/backupable_db.cc | 2 +-
 utilities/backupable/backupable_db_test.cc | 6 +++---
 utilities/blob_db/blob_db_impl.cc | 2 +-
 utilities/blob_db/blob_db_test.cc | 6 +++---
 utilities/cassandra/cassandra_format_test.cc | 2 +-
 utilities/cassandra/cassandra_functional_test.cc | 2 +-
 utilities/cassandra/cassandra_row_merge_test.cc | 2 +-
 utilities/cassandra/cassandra_serialize_test.cc | 2 +-
 utilities/cassandra/format.h | 2 +-
 utilities/cassandra/test_utils.h | 2 +-
 utilities/checkpoint/checkpoint_impl.cc | 2 +-
 utilities/checkpoint/checkpoint_test.cc | 6 +++---
 utilities/env_librados_test.cc | 2 +-
 utilities/env_mirror_test.cc | 2 +-
 utilities/env_timed_test.cc | 2 +-
 utilities/memory/memory_test.cc | 4 ++--
 .../string_append/stringappend_test.cc | 2 +-
 utilities/object_registry_test.cc | 2 +-
 utilities/options/options_util_test.cc | 4 ++--
 utilities/persistent_cache/block_cache_tier.cc | 2 +-
 utilities/persistent_cache/hash_table_test.cc | 2 +-
 utilities/persistent_cache/persistent_cache_test.h | 2 +-
 .../transactions/optimistic_transaction_test.cc | 4 ++--
 utilities/transactions/pessimistic_transaction.cc | 2 +-
 .../transactions/pessimistic_transaction_db.cc | 2 +-
 utilities/transactions/transaction_lock_mgr.cc | 2 +-
 utilities/transactions/transaction_test.cc | 10 +++++-----
 utilities/transactions/transaction_test.h | 10 +++++-----
 .../write_prepared_transaction_test.cc | 10 +++++-----
 utilities/transactions/write_prepared_txn_db.cc | 2 +-
 utilities/ttl/ttl_test.cc | 2 +-
 utilities/util_merge_operators_test.cc | 4 ++--
 .../write_batch_with_index_test.cc | 2 +-
 203 files changed, 322 insertions(+), 322 deletions(-)
 rename {util => test_util}/fault_injection_test_env.cc (99%)
 rename {util => test_util}/fault_injection_test_env.h (100%)
 rename {util => test_util}/mock_time_env.h (100%)
 rename {util => test_util}/sync_point.cc (95%)
 rename {util => test_util}/sync_point.h (100%)
 rename {util => test_util}/sync_point_impl.cc (98%)
 rename {util => test_util}/sync_point_impl.h (98%)
 rename {util => test_util}/testharness.cc (97%)
 rename {util => test_util}/testharness.h (100%)
 rename {util => test_util}/testutil.cc (99%)
 rename {util => test_util}/testutil.h (100%)
 rename {util => test_util}/transaction_test_util.cc (99%)
 rename {util => test_util}/transaction_test_util.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d74152d9d2..6449047fca6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -600,6 +600,10 @@ set(SOURCES
         table/sst_file_writer.cc
         table/table_properties.cc
         table/two_level_iterator.cc
+        test_util/sync_point.cc
+        test_util/sync_point_impl.cc
+        test_util/testutil.cc
+        test_util/transaction_test_util.cc
        tools/db_bench_tool.cc
        tools/dump/db_dump_tool.cc
        tools/ldb_cmd.cc
@@ -629,13 +633,9 @@ set(SOURCES
        util/slice.cc
        util/status.cc
        util/string_util.cc
-       util/sync_point.cc
-       util/sync_point_impl.cc
-       util/testutil.cc
        util/thread_local.cc
        util/threadpool_imp.cc
        util/trace_replay.cc
-       util/transaction_test_util.cc
        util/xxhash.cc
        utilities/backupable/backupable_db.cc
        utilities/blob_db/blob_compaction_filter.cc
@@ -1006,7 +1006,7 @@ if(WITH_TESTS)
     tools/db_bench.cc
     table/table_reader_bench.cc
     utilities/persistent_cache/hash_table_bench.cc)
-  add_library(testharness
OBJECT util/testharness.cc) + add_library(testharness OBJECT test_util/testharness.cc) foreach(sourcefile ${BENCHMARKS}) get_filename_component(exename ${sourcefile} NAME_WE) add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} @@ -1020,7 +1020,7 @@ if(WITH_TESTS) db/db_test_util.cc monitoring/thread_status_updater_debug.cc table/mock_table.cc - util/fault_injection_test_env.cc + test_util/fault_injection_test_env.cc utilities/cassandra/test_utils.cc ) # test utilities are only build in debug diff --git a/Makefile b/Makefile index ec0a04ed106..16d5da0b16c 100644 --- a/Makefile +++ b/Makefile @@ -404,8 +404,8 @@ LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) GTEST = $(GTEST_DIR)/gtest/gtest-all.o -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) +TESTUTIL = ./test_util/testutil.o +TESTHARNESS = ./test_util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) VALGRIND_ERROR = 2 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) diff --git a/TARGETS b/TARGETS index 7d271515728..c438aa3fb45 100644 --- a/TARGETS +++ b/TARGETS @@ -207,6 +207,9 @@ cpp_library( "table/sst_file_writer.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "test_util/sync_point.cc", + "test_util/sync_point_impl.cc", + "test_util/transaction_test_util.cc", "tools/dump/db_dump_tool.cc", "tools/ldb_cmd.cc", "tools/ldb_tool.cc", @@ -235,12 +238,9 @@ cpp_library( "util/slice.cc", "util/status.cc", "util/string_util.cc", - "util/sync_point.cc", - "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", "util/trace_replay.cc", - "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_compaction_filter.cc", @@ -309,10 +309,10 @@ cpp_library( srcs = [ "db/db_test_util.cc", "table/mock_table.cc", + "test_util/fault_injection_test_env.cc", + "test_util/testharness.cc", + "test_util/testutil.cc", "tools/trace_analyzer_tool.cc", - "util/fault_injection_test_env.cc", - "util/testharness.cc", - "util/testutil.cc", "utilities/cassandra/test_utils.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, @@ -326,9 +326,9 @@ cpp_library( cpp_library( name = "rocksdb_tools_lib", srcs = [ + "test_util/testutil.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", - "util/testutil.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index a5d71b65d4e..94b63a4e8bf 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -118,7 +118,7 @@ def generate_targets(repo_path): "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + src_mk.get("ANALYZER_LIB_SOURCES", []) + - ["util/testutil.cc"], + ["test_util/testutil.cc"], [":rocksdb_lib"]) # test for every test we found in the Makefile diff --git a/cache/cache_test.cc b/cache/cache_test.cc index f9f77234cdb..377ae146876 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -18,7 +18,7 @@ #include "cache/lru_cache.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 9980dd72b7b..575764611ce 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -8,7 +8,7 @@ #include #include #include "port/port.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { 
diff --git a/db/builder.cc b/db/builder.cc index b42ac187ef0..2b97ce1d608 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -34,7 +34,7 @@ #include "table/internal_iterator.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/column_family_test.cc b/db/column_family_test.cc index bdc832bd235..f5d57c35b78 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -21,11 +21,11 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "util/coding.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index ce80375e0e1..b97fd064e70 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -15,8 +15,8 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/compaction.cc b/db/compaction.cc index f8805376f1d..00ebd28b087 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -19,7 +19,7 @@ #include "db/column_family.h" #include "rocksdb/compaction_filter.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index ca55eef7123..7e060969962 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -9,7 +9,7 @@ #include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ ((seq) <= (snapshot) && \ diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index c466f6c9122..b0a553136a3 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -10,8 +10,8 @@ #include "port/port.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 7d2015e5629..91c7f437a17 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -61,7 +61,7 @@ #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 5ca6bf4a337..91441f5d76a 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -58,9 +58,9 @@ #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 
60394cc9735..4608cceeac1 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -27,8 +27,8 @@ #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index f500def41ee..c01f2884d4c 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -25,7 +25,7 @@ #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index c759dae8b6c..82fc16f4f5a 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -14,8 +14,8 @@ #include "util/logging.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index c25ae94fa1b..b8d23795fbc 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -25,7 +25,7 @@ #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index a7ff587949d..ba7042049cb 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -12,8 +12,8 @@ #include "util/hash.h" #include "util/kv_map.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/corruption_test.cc b/db/corruption_test.cc index ba97ca1502b..379c33e4599 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -28,8 +28,8 @@ #include "table/block_based_table_builder.h" #include "table/meta_blocks.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2d4487ff454..ecd6d71ca2e 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -12,8 +12,8 @@ #include "table/cuckoo_table_reader.h" #include "table/meta_blocks.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 236a534657f..45524b250f7 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,9 +10,9 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 91a04205e07..623836454db 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -14,8 +14,8 @@ #include "rocksdb/experimental.h" 
#include "rocksdb/utilities/convenience.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 46ba411b6fd..4ddc11986b8 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -7,7 +7,7 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif #include #include diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 5b630e21635..a1a1c8f99d6 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -23,7 +23,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 876605b2e48..b901a5a7805 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -9,8 +9,8 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_impl.cc b/db/db_impl.cc index e7ed1866469..749bd3629a0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -97,7 +97,7 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 1e39bdd4271..c5cc0736665 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -22,7 +22,7 @@ #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 4240b2012dc..0be85031ba3 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -20,7 +20,7 @@ #include "rocksdb/wal_filter.h" #include "table/block_based_table_factory.h" #include "util/rate_limiter.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { Options SanitizeOptions(const std::string& dbname, const Options& src) { diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index 92edc84254c..98463f7b27f 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -16,7 +16,7 @@ #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { // Convenience methods diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index a0f1dfeab45..8c3588e9abd 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -10,7 +10,7 @@ #include "rocksdb/slice.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" #ifdef GFLAGS diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 29fbd320861..49e670abc28 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -18,8 +18,8 @@ #include "table/iterator_wrapper.h" #include 
"table/merging_iterator.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index cb9a0e02e61..37a9f1a365b 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -20,8 +20,8 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index aa63286f60a..16d682fc083 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -5,7 +5,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/testutil.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index a4267c7d596..50a0923b4c8 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -10,8 +10,8 @@ #include "db/db_impl_secondary.h" #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 77ea0020dd6..82f106133e8 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -14,8 +14,8 @@ #include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef ROCKSDB_LITE diff --git a/db/db_test.cc b/db/db_test.cc index 7864a7e2c65..66df2323de2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -63,9 +63,9 @@ #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_test_util.h b/db/db_test_util.h index 81186bfb9ad..3bc107889b4 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -46,13 +46,13 @@ #include "table/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" -#include "util/mock_time_env.h" +#include "test_util/mock_time_env.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 2bd8af684e0..4f1df4a7d57 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -11,7 +11,7 @@ #include "port/stack_trace.h" #if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 
78f72b4a0e7..9a1382e98ab 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -11,8 +11,8 @@ #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { class DBWALTest : public DBTestBase { diff --git a/db/db_write_test.cc b/db/db_write_test.cc index e6bab875114..322381b3867 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -12,9 +12,9 @@ #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 0b16c13f573..e3f06fe6b65 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -9,7 +9,7 @@ #include "db/dbformat.h" #include "util/logging.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 81ff8d0b99f..9c67102c5f0 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -21,9 +21,9 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/error_handler_test.cc b/db/error_handler_test.cc index d33e19df5d5..c18706fc28e 100644 --- a/db/error_handler_test.cc +++ b/db/error_handler_test.cc @@ -12,9 +12,9 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #include "rocksdb/sst_file_manager.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 256db0728bf..91a422bed9e 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,7 +9,7 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 7bfc64f77cb..26cd1127b94 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -24,7 +24,7 @@ #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 0a0994f0ea9..ebd6cb2b160 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -11,8 +11,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/fault_injection_test_env.h" -#include "util/testutil.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 1bfaa299456..330df7bfe48 100644 --- a/db/fault_injection_test.cc +++ 
b/db/fault_injection_test.cc @@ -21,12 +21,12 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 935a01ef8dd..754cb3c4651 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -13,8 +13,8 @@ #include "db/version_edit.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/filename_test.cc b/db/filename_test.cc index 869469f3f0c..dabe673d849 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -12,7 +12,7 @@ #include "db/dbformat.h" #include "port/port.h" #include "util/logging.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index 46915ca13a8..4930ecac7e9 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -51,7 +51,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 199ed29cacc..d97ad9f0c2d 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -15,8 +15,8 @@ #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 94e448ee97d..f95debec62c 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -22,7 +22,7 @@ #include "rocksdb/slice_transform.h" #include "table/merging_iterator.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 113ded94b69..9d6851dab16 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -35,7 +35,7 @@ int main() { return 0; } #include "rocksdb/status.h" #include "rocksdb/table.h" #include "util/gflags_compat.h" -#include "util/testharness.h" +#include "test_util/testharness.h" const int MAX_SHARDS = 100000; diff --git a/db/listener_test.cc b/db/listener_test.cc index 6b716a1d4b1..663116b7b8d 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -29,9 +29,9 @@ #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" #ifndef ROCKSDB_LITE diff --git a/db/log_test.cc b/db/log_test.cc index fd237b030e7..5b159acf21f 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -14,8 +14,8 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include 
"test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace log { diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 02732a55583..35e5019ca7e 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -12,7 +12,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "port/port.h" using namespace rocksdb; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index bdcbd218663..b50b58a1af7 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -24,7 +24,7 @@ #include "table/merging_iterator.h" #include "util/coding.h" #include "util/log_buffer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index a14c13b893b..59da8af1664 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -14,8 +14,8 @@ #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index b61092ee575..dc3624af53e 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -10,8 +10,8 @@ #include "db/merge_helper.h" #include "rocksdb/comparator.h" #include "util/coding.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/merge_test.cc b/db/merge_test.cc index 3bd4b9a6004..d3dadaa5d30 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -18,7 +18,7 @@ #include "db/db_impl.h" #include "db/write_batch_internal.h" #include "utilities/merge_operators.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 6bf2acf8519..c6e7d6af07a 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -21,9 +21,9 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using std::cerr; using std::cout; diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 0a9a34ff0b5..c7eba52c290 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -10,7 +10,7 @@ #include "db/db_test_util.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { class OptionsFileTest : public testing::Test { diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index b7efec182a1..42d592862c7 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -19,7 +19,7 @@ #include "rocksdb/slice_transform.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" bool FLAGS_random_key = false; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index ef770c2e50b..7648ed85ff7 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ 
-33,8 +33,8 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index be420ded183..e8290e76bca 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -31,7 +31,7 @@ int main() { #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 34b2f7e5db1..54a86169b20 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -26,7 +26,7 @@ int main() { #include "util/coding.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/testutil.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 28c8129ecb0..7ce666326a8 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -12,7 +12,7 @@ #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index ddd3f774176..11f3574967d 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -7,7 +7,7 @@ #include "db/db_test_util.h" #include "rocksdb/comparator.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/table_cache.cc b/db/table_cache.cc index 01724dfc5cb..4efd3fdf759 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -25,7 +25,7 @@ #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index ea561e982ff..6171b2938c2 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -20,8 +20,8 @@ #include "table/table_builder.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 514952bb5b1..5c3bd686b1c 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -8,8 +8,8 @@ #include "db/version_set.h" #include "util/logging.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/version_edit.cc b/db/version_edit.cc index 01ec44515a7..018517a1381 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -14,7 +14,7 @@ #include "util/coding.h" #include "util/event_logger.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 64d1fd77bc1..5f1ae98ba4f 100644 --- a/db/version_edit_test.cc +++ 
b/db/version_edit_test.cc @@ -9,8 +9,8 @@ #include "db/version_edit.h" #include "util/coding.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/version_set.cc b/db/version_set.cc index c10eb9f7ac3..b9616f3730b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -52,7 +52,7 @@ #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 43924a3addd..41c27fdab65 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -12,8 +12,8 @@ #include "table/mock_table.h" #include "util/logging.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index cce714750e7..20b5780c877 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -34,7 +34,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 379f12f52aa..b1478e26e54 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -21,8 +21,8 @@ #include "table/mock_table.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 322bd8945b0..88c52522917 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -19,7 +19,7 @@ #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 7f2b20d892f..dbb4759fa03 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -17,8 +17,8 @@ #include "rocksdb/write_batch.h" #include "port/port.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" using std::string; diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 55feb00a339..919c2c11808 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -8,7 +8,7 @@ #include "db/write_controller.h" #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/write_thread.cc b/db/write_thread.cc index 835992c8fce..872d32ca81b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -10,7 +10,7 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/random.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 93764d945f9..f306edbd6ba 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -12,7 +12,7 @@ #include "env/mock_env.h" #include "rocksdb/env.h" #include "rocksdb/utilities/object_registry.h" -#include "util/testharness.h" +#include 
"test_util/testharness.h" namespace rocksdb { diff --git a/env/env_posix.cc b/env/env_posix.cc index 387c0279397..3f75dd6893c 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -54,7 +54,7 @@ #include "util/logging.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/thread_local.h" #include "util/threadpool_imp.h" diff --git a/env/env_test.cc b/env/env_test.cc index 47800928499..852a99c1adc 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -44,9 +44,9 @@ #include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifdef OS_LINUX static const size_t kPageSize = sysconf(_SC_PAGESIZE); diff --git a/env/io_posix.cc b/env/io_posix.cc index 0ced06ff262..27198b1f975 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -33,7 +33,7 @@ #include "rocksdb/slice.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index 97c49b5f516..b21b953b568 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -10,7 +10,7 @@ #include #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/env/posix_logger.h b/env/posix_logger.h index 401df6a3ffb..8406a6d8acc 100644 --- a/env/posix_logger.h +++ b/env/posix_logger.h @@ -27,7 +27,7 @@ #include "env/io_posix.h" #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 41ec84376b6..44e3110d5e7 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -15,7 +15,7 @@ #include "rocksdb/env.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index c8544004cd5..122a5d6177e 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef ROCKSDB_LITE diff --git a/file/filename.cc b/file/filename.cc index 0a48dc78c36..ed19b4109ff 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -21,7 +21,7 @@ #include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 86bcb2d19ca..9b7278c7d5b 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -17,7 +17,7 @@ #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 
266fb4abf74..9d5de9a2f86 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -23,7 +23,7 @@ #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" /* * Class: org_rocksdb_WriteBatchTest diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index b416ef7c557..a2f62d5304a 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -14,7 +14,7 @@ #include "util/concurrent_arena.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 51ff11a015c..ae199096563 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -39,7 +39,7 @@ int main() { #include "util/gflags_compat.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/testutil.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::RegisterFlagValidator; diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 50c3588bb86..054e3c9df07 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -13,7 +13,7 @@ #include "util/arena.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 3c89c8095e1..06514eabde4 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index df58822fc21..ed9a7bd32ff 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -7,7 +7,7 @@ #include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index 7b61bcf4fb8..796bb26dd4b 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -6,7 +6,7 @@ #include "monitoring/instrumented_mutex.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/monitoring/iostats_context_test.cc b/monitoring/iostats_context_test.cc index 74d3e43291d..28d305d021a 100644 --- a/monitoring/iostats_context_test.cc +++ b/monitoring/iostats_context_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "rocksdb/iostats_context.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/statistics_test.cc b/monitoring/statistics_test.cc index a77022bfb3d..162afb264b2 100644 --- a/monitoring/statistics_test.cc +++ b/monitoring/statistics_test.cc @@ -5,8 +5,8 @@ // #include "port/stack_trace.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "rocksdb/statistics.h" diff --git a/options/options_parser.cc b/options/options_parser.cc index f09e53e4a49..9ae3dfb2785 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -19,7 +19,7 @@ #include "util/cast_util.h" #include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "port/port.h" diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 79a4fa81475..2e21a2688f8 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -15,7 +15,7 @@ #include "options/options_helper.h" #include "rocksdb/convenience.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #ifndef GFLAGS bool FLAGS_enable_print = false; diff --git a/options/options_test.cc b/options/options_test.cc index ded336dd18d..704b2db802b 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -30,8 +30,8 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators/bytesxor.h" #ifndef GFLAGS diff --git a/port/win/env_default.cc b/port/win/env_default.cc index d24c21918aa..db64878bc02 100644 --- a/port/win/env_default.cc +++ b/port/win/env_default.cc @@ -12,7 +12,7 @@ #include #include "port/win/env_win.h" #include "util/compression_context_cache.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 64ded8465d0..15d1e711412 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -12,7 +12,7 @@ #include "monitoring/iostats_context_imp.h" #include "util/aligned_buffer.h" #include "util/coding.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace port { diff --git a/src.mk b/src.mk index 2541b9fd12b..100b3355e74 100644 --- a/src.mk +++ b/src.mk @@ -131,6 +131,9 @@ LIB_SOURCES = \ table/sst_file_writer.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + test_util/sync_point.cc \ + test_util/sync_point_impl.cc \ + test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ util/arena.cc \ util/auto_roll_logger.cc \ @@ -156,12 +159,9 @@ LIB_SOURCES = \ util/slice.cc \ util/status.cc \ util/string_util.cc \ - util/sync_point.cc \ - util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ util/trace_replay.cc \ - util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ @@ -242,15 +242,15 @@ ANALYZER_LIB_SOURCES = \ MOCK_LIB_SOURCES = \ table/mock_table.cc \ - util/fault_injection_test_env.cc + test_util/fault_injection_test_env.cc BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ TEST_LIB_SOURCES = \ db/db_test_util.cc \ - util/testharness.cc \ - util/testutil.cc \ + test_util/testharness.cc \ + 
test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ MAIN_SOURCES = \ diff --git a/table/block.h b/table/block.h index df4d4eb82fc..869d2f1f286 100644 --- a/table/block.h +++ b/table/block.h @@ -31,7 +31,7 @@ #include "table/data_block_hash_index.h" #include "table/internal_iterator.h" #include "util/random.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index 6b352b2f6b0..2cb3abc27a6 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -13,8 +13,8 @@ #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 34e40979247..a45fc0a5b47 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -51,7 +51,7 @@ #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/xxhash.h" namespace rocksdb { diff --git a/table/block_test.cc b/table/block_test.cc index 3e0ff3eab59..d359b4e59ca 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -23,8 +23,8 @@ #include "table/block_builder.h" #include "table/format.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc index f18c33b8399..8478adf523d 100644 --- a/table/cleanable_test.cc +++ b/table/cleanable_test.cc @@ -9,8 +9,8 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index c1e350327f3..eeba9480592 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -13,8 +13,8 @@ #include "table/meta_blocks.h" #include "table/cuckoo_table_builder.h" #include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { extern const uint64_t kCuckooTableMagicNumber; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 74fb52e6c78..6d596f6e115 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -31,8 +31,8 @@ int main() { #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/data_block_hash_index_test.cc b/table/data_block_hash_index_test.cc index 11226648ef2..0511b257aa3 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/data_block_hash_index_test.cc @@ -15,8 +15,8 @@ #include "table/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" -#include "util/testharness.h" -#include 
"util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index 3abae979a4c..0ef5c5a970c 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -10,8 +10,8 @@ #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/merger_test.cc b/table/merger_test.cc index 1b04d065727..8efa2834db6 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -7,8 +7,8 @@ #include #include "table/merging_iterator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 244b5e82c3d..85a2fcc0324 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -23,7 +23,7 @@ #include "util/autovector.h" #include "util/heap.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 3f48095c55b..98e05a4d032 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -18,7 +18,7 @@ #include "table/table_properties_internal.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/mock_table.h b/table/mock_table.h index 5bca14644d8..f99941863a9 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -20,8 +20,8 @@ #include "table/table_builder.h" #include "table/table_reader.h" #include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace mock { diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index 8afa530d71a..4bdc2fd36f1 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -13,8 +13,8 @@ #include "util/coding.h" #include "util/hash.h" #include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 51bc975af00..529634ccd75 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -11,8 +11,8 @@ #include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index b9a7273e07d..71b395fd6be 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -11,7 +11,7 @@ #include "table/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" #include "util/file_reader_writer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 
a9b75715b5f..6b05d385e06 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -24,8 +24,8 @@ int main() { #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/table_test.cc b/table/table_test.cc index 7292ad7c32d..dccc4919409 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -49,9 +49,9 @@ #include "util/compression.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/util/fault_injection_test_env.cc b/test_util/fault_injection_test_env.cc similarity index 99% rename from util/fault_injection_test_env.cc rename to test_util/fault_injection_test_env.cc index 9cad23871b6..a591ff4b57b 100644 --- a/util/fault_injection_test_env.cc +++ b/test_util/fault_injection_test_env.cc @@ -11,7 +11,7 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include #include diff --git a/util/fault_injection_test_env.h b/test_util/fault_injection_test_env.h similarity index 100% rename from util/fault_injection_test_env.h rename to test_util/fault_injection_test_env.h diff --git a/util/mock_time_env.h b/test_util/mock_time_env.h similarity index 100% rename from util/mock_time_env.h rename to test_util/mock_time_env.h diff --git a/util/sync_point.cc b/test_util/sync_point.cc similarity index 95% rename from util/sync_point.cc rename to test_util/sync_point.cc index 4599c256d9f..a09be9e8fa1 100644 --- a/util/sync_point.cc +++ b/test_util/sync_point.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point.h" -#include "util/sync_point_impl.h" +#include "test_util/sync_point.h" +#include "test_util/sync_point_impl.h" int rocksdb_kill_odds = 0; std::vector<std::string> rocksdb_kill_prefix_blacklist; diff --git a/util/sync_point.h b/test_util/sync_point.h similarity index 100% rename from util/sync_point.h rename to test_util/sync_point.h diff --git a/util/sync_point_impl.cc b/test_util/sync_point_impl.cc similarity index 98% rename from util/sync_point_impl.cc rename to test_util/sync_point_impl.cc index 248c381a328..db44f472a05 100644 --- a/util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point_impl.h" +#include "test_util/sync_point_impl.h" #ifndef NDEBUG namespace rocksdb { diff --git a/util/sync_point_impl.h b/test_util/sync_point_impl.h similarity index 98% rename from util/sync_point_impl.h rename to test_util/sync_point_impl.h index 3c7e7049183..d96d7325786 100644 --- a/util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory).
-#include "util/sync_point.h" +#include "test_util/sync_point.h" #include #include diff --git a/util/testharness.cc b/test_util/testharness.cc similarity index 97% rename from util/testharness.cc rename to test_util/testharness.cc index 8f5eb2a4d6e..62cc535a198 100644 --- a/util/testharness.cc +++ b/test_util/testharness.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/testharness.h" +#include "test_util/testharness.h" #include #include diff --git a/util/testharness.h b/test_util/testharness.h similarity index 100% rename from util/testharness.h rename to test_util/testharness.h diff --git a/util/testutil.cc b/test_util/testutil.cc similarity index 99% rename from util/testutil.cc rename to test_util/testutil.cc index b6493258f60..18e1a45bb36 100644 --- a/util/testutil.cc +++ b/test_util/testutil.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/testutil.h" +#include "test_util/testutil.h" #include #include diff --git a/util/testutil.h b/test_util/testutil.h similarity index 100% rename from util/testutil.h rename to test_util/testutil.h diff --git a/util/transaction_test_util.cc b/test_util/transaction_test_util.cc similarity index 99% rename from util/transaction_test_util.cc rename to test_util/transaction_test_util.cc index bd2d6afdca0..14d39065182 100644 --- a/util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -8,7 +8,7 @@ #define __STDC_FORMAT_MACROS #endif -#include "util/transaction_test_util.h" +#include "test_util/transaction_test_util.h" #include #include diff --git a/util/transaction_test_util.h b/test_util/transaction_test_util.h similarity index 100% rename from util/transaction_test_util.h rename to test_util/transaction_test_util.h diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 2ceca4fd950..12caa2809ad 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -68,8 +68,8 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "util/xxhash.h" #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 1b19de5f17e..52a1f9b91eb 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -11,8 +11,8 @@ #include "options/options_parser.h" #include "rocksdb/utilities/options_util.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifdef GFLAGS #include "util/gflags_compat.h" diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index c640b5945b0..41ae4c2761e 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -19,7 +19,7 @@ int main() { #include "rocksdb/db.h" #include "rocksdb/types.h" #include "util/gflags_compat.h" -#include "util/testutil.h" +#include "test_util/testutil.h" // Run a thread to perform Put's. // Another thread uses GetUpdatesSince API to keep getting the updates. 
diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 579178efffc..72461b13ab4 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -72,9 +72,9 @@ int main() { #include "util/string_util.h" // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif // !(defined NDEBUG) || !defined(OS_WIN) -#include "util/testutil.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index 3b709953373..24622b7ccf3 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" -#include "util/testharness.h" +#include "test_util/testharness.h" using std::string; using std::vector; diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index 1718b3344e9..a76416b6c1d 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -12,8 +12,8 @@ #include "rocksdb/utilities/ldb_cmd.h" #include "tools/ldb_cmd_impl.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index 6bf3e3b97a1..a2c226b926c 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -16,8 +16,8 @@ #include "table/block_based_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index b2cc777d5a4..2f31c5d8249 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -28,8 +28,8 @@ int main() { #include "rocksdb/status.h" #include "rocksdb/trace_reader_writer.h" #include "tools/trace_analyzer_tool.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/trace_replay.h" namespace rocksdb { diff --git a/util/arena.cc b/util/arena.cc index d7799eb266a..67e8a4db782 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -22,7 +22,7 @@ #include "port/port.h" #include "rocksdb/env.h" #include "util/logging.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/arena_test.cc b/util/arena_test.cc index 9dfc28ab2ea..052f2a6d5db 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -9,7 +9,7 @@ #include "util/arena.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 24f4714b4fd..5a2049b6405 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -14,7 +14,7 @@ #include "port/port.h" #include "port/util_logger.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index ab9e0595808..3adbdbb1363 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -20,8 +20,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "util/logging.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include 
"test_util/testharness.h" namespace rocksdb { namespace { diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 13299669cd4..edb7af9eaf2 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -11,8 +11,8 @@ #include "rocksdb/env.h" #include "util/autovector.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using std::cout; using std::endl; diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 4b25e9b6c6f..87cd9da5569 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -22,8 +22,8 @@ int main() { #include "util/arena.h" #include "util/gflags_compat.h" #include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/coding_test.cc b/util/coding_test.cc index f7b1671d1ec..7f73e00e155 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -9,7 +9,7 @@ #include "util/coding.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc index d5983586bc6..90f0c815cc2 100644 --- a/util/crc32c_test.cc +++ b/util/crc32c_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "util/crc32c.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "util/coding.h" namespace rocksdb { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 4244bff1a4e..a8a7000f648 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -29,8 +29,8 @@ int main() { #include "util/gflags_compat.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/event_logger_test.cc b/util/event_logger_test.cc index 4bcf30ff5eb..16c6c59f70e 100644 --- a/util/event_logger_test.cc +++ b/util/event_logger_test.cc @@ -6,7 +6,7 @@ #include #include "util/event_logger.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 9a818cb0f07..3003a1ebac0 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -17,7 +17,7 @@ #include "port/port.h" #include "util/random.h" #include "util/rate_limiter.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 1ef23e8c936..317c1d6c78c 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -16,7 +16,7 @@ #include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" #include "util/aligned_buffer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 6a7ea6d7da4..18bb65a72bb 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -7,8 +7,8 @@ #include #include #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git 
a/util/filelock_test.cc b/util/filelock_test.cc index f8721b5909a..bd0fc7c4221 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -9,7 +9,7 @@ #include #include #include "util/coding.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/hash_test.cc b/util/hash_test.cc index 959e8cd0f68..6618c5a4bc1 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -10,7 +10,7 @@ #include #include "util/hash.h" -#include "util/testharness.h" +#include "test_util/testharness.h" // The hash algorithm is part of the file format, for example for the Bloom // filters. Test that the hash values are stable for a set of random strings of diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index 5c9b3e84bf4..dd5322151e3 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -15,8 +15,8 @@ int main() { #include "rocksdb/env.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 9d23c38f7ac..93665837fc4 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -12,7 +12,7 @@ #include "port/port.h" #include "rocksdb/env.h" #include "util/aligned_buffer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index d3f3be3ba95..3316a75b571 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -20,8 +20,8 @@ #include "db/db_test_util.h" #include "rocksdb/env.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index 2d4729da02c..4226f35396c 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -10,7 +10,7 @@ #include "port/port.h" #include "rocksdb/env.h" -#include "util/mock_time_env.h" +#include "test_util/mock_time_env.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/repeatable_thread_test.cc b/util/repeatable_thread_test.cc index ee853c1056f..29af340d7cb 100644 --- a/util/repeatable_thread_test.cc +++ b/util/repeatable_thread_test.cc @@ -8,8 +8,8 @@ #include "db/db_test_util.h" #include "util/repeatable_thread.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" class RepeatableThreadTest : public testing::Test { public: diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc index f91675ccec8..96d90a9cd9b 100644 --- a/util/slice_transform_test.cc +++ b/util/slice_transform_test.cc @@ -14,7 +14,7 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index a4a343a9cf4..37f59bab8ca 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -8,7 +8,7 @@ #include "monitoring/thread_status_updater.h" #include "rocksdb/db.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #ifdef ROCKSDB_USING_THREAD_STATUS diff --git a/util/thread_local_test.cc 
b/util/thread_local_test.cc index 789be83d8fd..787638138c0 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -10,9 +10,9 @@ #include "rocksdb/env.h" #include "port/port.h" #include "util/autovector.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/util/timer_queue.h b/util/timer_queue.h index bd8a4f85048..a5f74ae5679 100644 --- a/util/timer_queue.h +++ b/util/timer_queue.h @@ -32,7 +32,7 @@ #include #include "port/port.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" // Allows execution of handlers at a specified time in the future // Guarantees: diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 149eb911f7f..816c9718b2d 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -20,7 +20,7 @@ #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/checkpoint/checkpoint_impl.h" #ifndef __STDC_FORMAT_MACROS diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index e4abd96e95f..c7377064f82 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -27,9 +27,9 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 9f3839370eb..54eb3f2dbb5 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -33,7 +33,7 @@ #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/timer_queue.h" #include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_db_iterator.h" diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index e24ba1d983c..19dce3f87d7 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -19,11 +19,11 @@ #include "port/port.h" #include "rocksdb/utilities/debug.h" #include "util/cast_util.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "utilities/blob_db/blob_db.h" #include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_index.h" diff --git a/utilities/cassandra/cassandra_format_test.cc b/utilities/cassandra/cassandra_format_test.cc index 8f9baa72357..7af21247eb1 100644 --- a/utilities/cassandra/cassandra_format_test.cc +++ b/utilities/cassandra/cassandra_format_test.cc @@ -5,7 +5,7 @@ #include #include -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/serialize.h" #include "utilities/cassandra/test_utils.h" diff --git 
a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index dacc6f03ce3..347846d075c 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -8,7 +8,7 @@ #include "db/db_impl.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/cassandra/cassandra_compaction_filter.h" diff --git a/utilities/cassandra/cassandra_row_merge_test.cc b/utilities/cassandra/cassandra_row_merge_test.cc index 8d6dc10ded0..88dee118b5b 100644 --- a/utilities/cassandra/cassandra_row_merge_test.cc +++ b/utilities/cassandra/cassandra_row_merge_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/test_utils.h" diff --git a/utilities/cassandra/cassandra_serialize_test.cc b/utilities/cassandra/cassandra_serialize_test.cc index 68d2c163d96..bfce2a36e30 100644 --- a/utilities/cassandra/cassandra_serialize_test.cc +++ b/utilities/cassandra/cassandra_serialize_test.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/serialize.h" using namespace rocksdb::cassandra; diff --git a/utilities/cassandra/format.h b/utilities/cassandra/format.h index 09a4923565f..562c1aff3ff 100644 --- a/utilities/cassandra/format.h +++ b/utilities/cassandra/format.h @@ -60,7 +60,7 @@ #include #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { namespace cassandra { diff --git a/utilities/cassandra/test_utils.h b/utilities/cassandra/test_utils.h index 80374b0cbab..f58bd730015 100644 --- a/utilities/cassandra/test_utils.h +++ b/utilities/cassandra/test_utils.h @@ -5,7 +5,7 @@ #pragma once #include -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/serialize.h" diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 920f9bf535b..7468c8eedee 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -28,7 +28,7 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "rocksdb/utilities/checkpoint.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 9318a733dcf..da2972affd7 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -23,9 +23,9 @@ #include "rocksdb/env.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/transaction_db.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" namespace rocksdb { class CheckpointTest : public testing::Test { diff --git a/utilities/env_librados_test.cc b/utilities/env_librados_test.cc index 1a3746860b6..e5f91894599 100644 --- 
a/utilities/env_librados_test.cc +++ b/utilities/env_librados_test.cc @@ -9,7 +9,7 @@ #include "rocksdb/utilities/env_librados.h" #include #include "env/mock_env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "rocksdb/db.h" #include "rocksdb/slice.h" diff --git a/utilities/env_mirror_test.cc b/utilities/env_mirror_test.cc index 3c0ed228522..6b20f1f1334 100644 --- a/utilities/env_mirror_test.cc +++ b/utilities/env_mirror_test.cc @@ -8,7 +8,7 @@ #include "rocksdb/utilities/env_mirror.h" #include "env/mock_env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/utilities/env_timed_test.cc b/utilities/env_timed_test.cc index 8bdef6396e0..989c79a391d 100644 --- a/utilities/env_timed_test.cc +++ b/utilities/env_timed_test.cc @@ -7,7 +7,7 @@ #include "rocksdb/env.h" #include "rocksdb/perf_context.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 8d976ef9214..c3ff640816e 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -12,8 +12,8 @@ #include "rocksdb/utilities/stackable_db.h" #include "table/block_based_table_factory.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 54c89a03abf..160bd347bd2 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -15,7 +15,7 @@ #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/merge_operators/string_append/stringappend2.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "util/random.h" using namespace rocksdb; diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc index 4444d8712f9..cc7c38d8a65 100644 --- a/utilities/object_registry_test.cc +++ b/utilities/object_registry_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/object_registry.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index ed7bfdfd6f7..342db490280 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -18,8 +18,8 @@ #include "rocksdb/table.h" #include "rocksdb/utilities/options_util.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef GFLAGS bool FLAGS_enable_print = false; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index f7f72df6dfc..775ef29cf8d 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -13,7 +13,7 @@ #include "port/port.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/persistent_cache/block_cache_tier_file.h" namespace rocksdb { diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc 
index d6ff3e68e42..51ad211e929 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -11,7 +11,7 @@ #include "db/db_test_util.h" #include "util/arena.h" #include "util/random.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/persistent_cache/hash_table.h" #include "utilities/persistent_cache/hash_table_evictable.h" diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index ad99ea864bd..33cda4ea72d 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -23,7 +23,7 @@ #include "table/block_builder.h" #include "port/port.h" #include "util/arena.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/persistent_cache/volatile_tier_impl.h" namespace rocksdb { diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index fbb0d44fdc7..e3105a2139c 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -15,8 +15,8 @@ #include "util/crc32c.h" #include "util/logging.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/transaction_test_util.h" +#include "test_util/testharness.h" +#include "test_util/transaction_test_util.h" #include "port/port.h" using std::string; diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index d0e4f20467b..fd9da17aac4 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -21,7 +21,7 @@ #include "rocksdb/utilities/transaction_db.h" #include "util/cast_util.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction_db.h" #include "utilities/transactions/transaction_util.h" diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index c1b37c148f5..95c88594ca9 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -22,7 +22,7 @@ #include "rocksdb/utilities/transaction_db.h" #include "util/cast_util.h" #include "util/mutexlock.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" #include "utilities/transactions/write_prepared_txn_db.h" diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 48d496bfd7f..173e012d88a 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -25,7 +25,7 @@ #include "rocksdb/utilities/transaction_db_mutex.h" #include "util/cast_util.h" #include "util/hash.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "util/thread_local.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 3c8036614f0..d183401f42f 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -23,13 +23,13 @@ #include "rocksdb/utilities/transaction.h" #include 
"rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 2e3b9952709..8dfa6b053c5 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -21,13 +21,13 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 7b5a585df91..5287cca2038 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -27,14 +27,14 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/transaction_test_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 5250f3f2de5..0508a596e43 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -24,7 +24,7 @@ #include "util/cast_util.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 88e90af269c..c7d8f52aa52 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -10,7 +10,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/utilities/db_ttl.h" #include "util/string_util.h" -#include 
"util/testharness.h" +#include "test_util/testharness.h" #ifndef OS_WIN #include #endif diff --git a/utilities/util_merge_operators_test.cc b/utilities/util_merge_operators_test.cc index d8b3cfba69c..d591ac8f12c 100644 --- a/utilities/util_merge_operators_test.cc +++ b/utilities/util_merge_operators_test.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index be715fe32ca..f8875d9ac1f 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -16,7 +16,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" From bd44ec2006fbb44d632ee7be7cf8f553d90b09d9 Mon Sep 17 00:00:00 2001 From: anand76 Date: Thu, 30 May 2019 11:38:02 -0700 Subject: [PATCH 082/572] Fix reopen voting logic in db_stress when using MultiGet (#5374) Summary: When the --reopen option is non-zero, the DB is reopened after every ops_per_thread/(reopen+1) ops, with the check being done after every op. With MultiGet, we might do multiple ops in one iteration, which broke the logic that checked when to synchronize among the threads and reopen the DB. This PR fixes that logic. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5374 Differential Revision: D15559780 Pulled By: anand1976 fbshipit-source-id: ee6563a68045df7f367eca3cbc2500d3e26359ef --- tools/db_stress.cc | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 72461b13ab4..b9ab1a2df11 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1967,13 +1967,18 @@ class StressTest { const int writeBound = prefixBound + (int)FLAGS_writepercent; const int delBound = writeBound + (int)FLAGS_delpercent; const int delRangeBound = delBound + (int)FLAGS_delrangepercent; + const uint64_t ops_per_open = FLAGS_ops_per_thread / (FLAGS_reopen + 1); + int multiget_batch_size = 0; thread->stats.Start(); for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { if (thread->shared->HasVerificationFailedYet()) { break; } - if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) { + // Check if the multiget batch crossed the ops_per_open boundary. If it + // did, then we should vote to reopen + if (i != 0 && (i % ops_per_open == 0 || + i % ops_per_open < (i - multiget_batch_size) % ops_per_open)) { { thread->stats.FinishedSingleOp(); MutexLock l(thread->shared->GetMutex()); @@ -2168,7 +2173,7 @@ class StressTest { snap_state); } while (!thread->snapshot_queue.empty() && - i == thread->snapshot_queue.front().first) { + i >= thread->snapshot_queue.front().first) { auto snap_state = thread->snapshot_queue.front().second; assert(snap_state.snapshot); // Note: this is unsafe as the cf might be dropped concurrently. 
But it @@ -2185,13 +2190,24 @@ class StressTest { } int prob_op = thread->rand.Uniform(100); + // Reset this in case we pick something other than a read op. We don't + // want to use a stale value when deciding at the beginning of the loop + // whether to vote to reopen + multiget_batch_size = 0; if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { // OPERATION read if (FLAGS_use_multiget) { - int num_keys = thread->rand.Uniform(64); - rand_keys = GenerateNKeys(thread, num_keys, i); + // Leave room for one more iteration of the loop with a single key + // batch. This is to ensure that each thread does exactly the same + // number of ops + multiget_batch_size = static_cast<int>( + std::min(static_cast<uint64_t>(thread->rand.Uniform(64)), + FLAGS_ops_per_thread - i - 1)); + // If it's the last iteration, ensure that multiget_batch_size is 1 + multiget_batch_size = std::max(multiget_batch_size, 1); + rand_keys = GenerateNKeys(thread, multiget_batch_size, i); TestMultiGet(thread, read_opts, rand_column_families, rand_keys); - i += num_keys - 1; + i += multiget_batch_size - 1; } else { TestGet(thread, read_opts, rand_column_families, rand_keys); } From 1e355842519debea764a8e04c5c08918dcc01d91 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 30 May 2019 11:49:36 -0700 Subject: [PATCH 083/572] Move the index readers out of the block cache (#5298) Summary: Currently, when the block cache is used for index blocks as well, it is not really the index block that is stored in the cache but an IndexReader object. Since this object is not pure data (it has, for instance, pointers that might dangle), it's not really sharable. To avoid the issues around this, the current code uses a dummy unique cache key for each TableReader to store the IndexReader, and erases the IndexReader entry when the TableReader is closed. Instead of doing this, the new code moves the IndexReader out of the cache altogether. In particular, instead of the TableReader owning, or caching/pinning the IndexReader based on the customer's settings, the TableReader unconditionally owns the IndexReader, which in turn owns/caches/pins the index block (which is itself sharable and thus can be safely put in the cache without any hacks). Note: the change has two side effects: 1) Partitions of partitioned indexes no longer affect the read amplification statistics. 2) Eviction statistics for index blocks are temporarily broken. We plan to fix this in a separate phase. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298 Differential Revision: D15303203 Pulled By: ltamasi fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47 --- HISTORY.md | 2 + db/db_block_cache_test.cc | 12 +- table/block_based_table_reader.cc | 1085 ++++++++++++++--------------- table/block_based_table_reader.h | 114 ++- table/table_test.cc | 67 +- 5 files changed, 590 insertions(+), 690 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 40d11096df0..55366b006fc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,8 @@ ## Unreleased ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Partitions of partitioned indexes no longer affect the read amplification statistics. +* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs.
Assign to 0 to disable the feature. diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index f6e1aad323c..8eb73a23dd7 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -365,7 +365,10 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); + // The index eviction statistics were broken by the refactoring that moved + // the index readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. @@ -377,8 +380,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), - index_bytes_insert); + // The index eviction statistics were broken by the refactoring that moved + // the index readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), + // index_bytes_insert); ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), filter_bytes_insert); } diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index a45fc0a5b47..82f96492662 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -120,16 +120,8 @@ void DeleteCachedEntry(const Slice& /*key*/, void* value) { } void DeleteCachedFilterEntry(const Slice& key, void* value); -void DeleteCachedIndexEntry(const Slice& key, void* value); void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); -// Release the cached entry and decrement its ref count. -void ReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - // Release the cached entry and decrement its ref count. void ForceReleaseCachedEntry(void* arg, void* h) { Cache* cache = reinterpret_cast(arg); @@ -137,17 +129,6 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } -Slice GetCacheKeyFromOffset(const char* cache_key_prefix, - size_t cache_key_prefix_size, uint64_t offset, - char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); - return Slice(cache_key, static_cast(end - cache_key)); -} - Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, int level, Tickers block_cache_miss_ticker, Tickers block_cache_hit_ticker, @@ -217,70 +198,193 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, } // namespace +// Encapsulates common functionality for the various index reader +// implementations. 
Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { +public: + IndexReaderCommon(BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t) + , index_block_(std::move(index_block)) + { + assert(table_ != nullptr); + } + +protected: + static Status ReadIndexBlock(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, + GetContext* get_context, CachableEntry* index_block); + + BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + const TableProperties* const properties = + table_->get_rep()->table_properties.get(); + + return properties == nullptr || !properties->index_key_is_user_key; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + const TableProperties* const properties = + table_->get_rep()->table_properties.get(); + + return properties == nullptr || !properties->index_value_is_delta_encoded; + } + + Status GetOrReadIndexBlock(const ReadOptions& read_options, + GetContext* get_context, + CachableEntry* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() ? + index_block_.GetValue()->ApproximateMemoryUsage() : 0; + } + +private: + BlockBasedTable* table_; + CachableEntry index_block_; +}; + +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + CachableEntry* index_block) { + + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + constexpr bool is_index = true; + const Status s = BlockBasedTable::RetrieveBlock(prefetch_buffer, + rep, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, is_index, get_context); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + const ReadOptions& read_options, GetContext* get_context, + CachableEntry* index_block) const { + + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + *index_block = CachableEntry(index_block_.GetValue(), + nullptr /* cache */, nullptr /* cache_handle */, false /* own_value */); + return Status::OK(); + } + + return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, + read_options, get_context, index_block); +} + // Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public IndexReader { +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { public: // Read the partition index from the file and create an instance for // `PartitionIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. 
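// (In the new signature below, the `use_cache`, `prefetch`, and `pin` flags // interact as follows: the index block is read eagerly when `prefetch` is set // or the block cache is bypassed; with `use_cache && !pin`, the eagerly read // block is released again, so later lookups go through GetOrReadIndexBlock // and hence the block cache.)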
- static Status Create(BlockBasedTable* table, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); + static Status Create(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, IndexReader** index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, &index_block); + if (!s.ok()) { + return s; + } - if (s.ok()) { - *index_reader = new PartitionIndexReader( - table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq, index_value_is_full); + if (use_cache && !pin) { + index_block.Reset(); + } } - return s; + *index_reader = new PartitionIndexReader(table, std::move(index_block)); + + return Status::OK(); } // return a two-level iterator: first level is on the partition index InternalIteratorBase* NewIterator( - IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, - bool fill_cache = true) override { + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context) override { + + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(read_options, get_context, + &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + InternalIteratorBase* it = nullptr; + Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - return NewTwoLevelIterator( + it = NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_, - index_value_is_full_), - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_)); + table(), &partition_map_, index_key_includes_seq(), + index_value_is_full()), + index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_key_includes_seq(), + index_value_is_full())); } else { - auto ro = ReadOptions(); - ro.fill_cache = fill_cache; - bool kIsIndex = true; - // We don't return pinned datat from index blocks, so no need + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + constexpr bool is_index = true; + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return new BlockBasedTableIterator( - table_, ro, *icomparator_, - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_), - false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_, index_value_is_full_); + it = new BlockBasedTableIterator( + table(), ro, *internal_comparator(), + index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_key_includes_seq(), + index_value_is_full()), + false, true, /* prefix_extractor */ nullptr, is_index, + index_key_includes_seq(), index_value_is_full()); } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currently it assumes // the first level iter is always on heap and will attempt to delete it @@ -289,15 +393,26 @@ class PartitionIndexReader : public IndexReader { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; + auto rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + + CachableEntry<Block> index_block; + Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */, + &index_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level index block while trying to " + "cache index partitions: %s", s.ToString().c_str()); + return; + } + + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), &biter, + kNullStats, true, index_key_includes_seq(), index_value_is_full()); // Index partitions are assumed to be consecutive. Prefetch them all.
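// (Concretely, the code below locates the handles of the first and last // partitions and issues a single prefetch for the contiguous byte range // [first.offset(), last.offset() + last.size() + kBlockTrailerSize).)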
// Read the first block offset biter.SeekToFirst(); @@ -318,10 +433,10 @@ class PartitionIndexReader : public IndexReader { uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; - auto& file = table_->rep_->file; + auto& file = rep->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast<size_t>(prefetch_len)); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); @@ -332,7 +447,7 @@ class PartitionIndexReader : public IndexReader { const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks - s = table_->MaybeReadBlockAndLoadToCache( + s = BlockBasedTable::MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), rep, ro, handle, UncompressionDict::GetEmptyDict(), &block, is_index, nullptr /* get_context */); @@ -348,12 +463,8 @@ } } - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); + size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else @@ -364,78 +475,79 @@ } private: - PartitionIndexReader(BlockBasedTable* table, - const InternalKeyComparator* icomparator, - std::unique_ptr<Block>&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - table_(table), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - BlockBasedTable* table_; - std::unique_ptr<Block> index_block_; + PartitionIndexReader(BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) + {} + std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. // This class can be viewed as a thin wrapper for `Block` class which already // supports binary search. -class BinarySearchIndexReader : public IndexReader { +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { public: // Read index from the file and create an instance for // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified.
- static Status Create(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); + static Status Create(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, IndexReader** index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, &index_block); + if (!s.ok()) { + return s; + } - if (s.ok()) { - *index_reader = new BinarySearchIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); + if (use_cache && !pin) { + index_block.Reset(); + } } - return s; + *index_reader = new BinarySearchIndexReader(table, std::move(index_block)); + + return Status::OK(); } InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context) override { + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(read_options, get_context, + &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - } + auto it = index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, true, index_key_includes_seq(), index_value_is_full()); - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); + size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else @@ -445,60 +557,51 @@ class BinarySearchIndexReader : public IndexReader { } private: - BinarySearchIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - std::unique_ptr index_block_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; + BinarySearchIndexReader(BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) + {} }; // Index that leverages an internal hash table to quicken the lookup for a given // key. -class HashIndexReader : public IndexReader { +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { public: - static Status Create( - const SliceTransform* hash_key_extractor, const Footer& footer, - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, const BlockHandle& index_handle, - InternalIterator* meta_index_iter, IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); + static Status Create(BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, IndexReader** index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + auto rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, &index_block); + if (!s.ok()) { + return s; + } - if (!s.ok()) { - return s; + if (use_cache && !pin) { + index_block.Reset(); + } } // Note, failure to create prefix hash index does not need to be a // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
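// In other words, any failure below merely leaves prefix_index_ unset; the // reader then behaves like a plain binary search index over the same block.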
- auto new_index_reader = new HashIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); + auto new_index_reader = new HashIndexReader(table, std::move(index_block)); *index_reader = new_index_reader; // Get prefixes block BlockHandle prefixes_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, - &prefixes_handle); + Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); if (!s.ok()) { // TODO: log error return Status::OK(); @@ -513,6 +616,13 @@ class HashIndexReader : public IndexReader { return Status::OK(); } + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableCFOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + // Read contents for the blocks BlockContents prefixes_contents; BlockFetcher prefixes_block_fetcher( @@ -537,7 +647,8 @@ class HashIndexReader : public IndexReader { } BlockPrefixIndex* prefix_index = nullptr; - s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data, + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, prefixes_meta_contents.data, &prefix_index); // TODO: log error if (s.ok()) { @@ -548,24 +659,39 @@ class HashIndexReader : public IndexReader { } InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool total_order_seek = true, - bool /*dont_care*/ = true) override { + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context) override { + CachableEntry index_block; + const Status s = GetOrReadIndexBlock(read_options, get_context, + &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + const bool total_order_seek = read_options.total_order_seek || + disable_prefix_seek; + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, index_value_is_full_, - false /* block_contents_pinned */, prefix_index_.get()); - } + auto it = index_block.GetValue()->NewIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, total_order_seek, index_key_includes_seq(), + index_value_is_full(), false /* block_contents_pinned */, + prefix_index_.get()); - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); - usage += prefixes_contents_.usable_size(); + size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else @@ -578,37 +704,22 @@ class HashIndexReader : public IndexReader { } private: - HashIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } + HashIndexReader(BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) + {} - ~HashIndexReader() override {} - - std::unique_ptr index_block_; std::unique_ptr prefix_index_; - BlockContents prefixes_contents_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); - // Create dummy offset of index reader which is beyond the file size. - rep->dummy_index_reader_offset = - file_size + rep->table_options.block_cache->NewId(); } if (rep->table_options.persistent_cache != nullptr) { GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), @@ -814,7 +925,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // handle prefix correctly. 
rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); - SetupCacheKeyPrefix(rep, file_size); + SetupCacheKeyPrefix(rep); std::unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep)); // page cache options @@ -848,9 +959,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, return s; } s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), - new_table.get(), prefix_extractor, - prefetch_all, table_options, level, - prefetch_index_and_filter_in_cache); + new_table.get(), prefetch_all, table_options, + level); if (s.ok()) { // Update tail prefetch stats @@ -1116,9 +1226,8 @@ Status BlockBasedTable::ReadCompressionDictBlock( Status BlockBasedTable::PrefetchIndexAndFilterBlocks( Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, const SliceTransform* prefix_extractor, - bool prefetch_all, const BlockBasedTableOptions& table_options, - const int level, const bool prefetch_index_and_filter_in_cache) { + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level) { Status s; // Find filter handle and filter type @@ -1157,10 +1266,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( &rep->compression_dict_handle); } - bool need_upper_bound_check = - PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); + + const bool use_cache = table_options.cache_index_and_filter_blocks; + // prefetch the first level of index const bool prefetch_index = prefetch_all || @@ -1183,39 +1292,34 @@ const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && rep->filter_type == Rep::FilterType::kPartitionedFilter); + + IndexReader* index_reader = nullptr; + if (s.ok()) { + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, &index_reader); + if (s.ok()) { + assert(index_reader != nullptr); + rep->index_reader.reset(index_reader); + // The partitions of partitioned index are always stored in cache. They + // hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep->index_reader->CacheDependencies(pin_all); + } + } else { + delete index_reader; + index_reader = nullptr; + } + } + // pre-fetching of blocks is turned on // Will use block cache for meta-blocks access // Always prefetch index and filter for level 0 // TODO(ajkr): also prefetch compression dictionary block + // TODO(ajkr): also pin compression dictionary block when + // `pin_l0_filter_and_index_blocks_in_cache == true`. if (table_options.cache_index_and_filter_blocks) { assert(table_options.block_cache != nullptr); - if (prefetch_index) { - // Hack: Call NewIndexIterator() to implicitly add index to the - // block_cache - CachableEntry<IndexReader> index_entry; - // check prefix_extractor match only if hash based index is used - bool disable_prefix_seek = - rep->index_type == BlockBasedTableOptions::kHashSearch && - need_upper_bound_check; - if (s.ok()) { - std::unique_ptr<InternalIteratorBase<BlockHandle>> iter( - new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, - nullptr, &index_entry)); - s = iter->status(); - } - if (s.ok()) { - // This is the first call to NewIndexIterator() since we're in Open(). - // On success it should give us ownership of the `CachableEntry` by - // populating `index_entry`.
- assert(index_entry.GetValue() != nullptr); - if (prefetch_all) { - index_entry.GetValue()->CacheDependencies(pin_all); - } - if (pin_index) { - rep->index_entry = std::move(index_entry); - } - } - } if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = @@ -1232,24 +1336,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } } else { - // If we don't use block cache for meta-block access, we'll pre-load these - // blocks, which will kept in member variables in Rep and with a same life- - // time as this table object. - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, &index_reader, - meta_iter, level); - } std::unique_ptr compression_dict_block; if (s.ok()) { - rep->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks - if (prefetch_index_and_filter_in_cache || level == 0) { - rep->index_reader->CacheDependencies(pin_all); - } - // Set filter block if (rep->filter_policy) { const bool is_a_filter_partition = true; @@ -1259,14 +1347,12 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep->filter.reset(filter); // Refer to the comment above about paritioned indexes always being // cached - if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { + if (filter && prefetch_all) { filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); } } s = ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); - } else { - delete index_reader; } if (s.ok() && !rep->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); @@ -1350,7 +1436,7 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, bool is_index, GetContext* get_context) { @@ -1379,6 +1465,10 @@ Status BlockBasedTable::GetDataBlockFromCache( : nullptr, statistics, get_context); if (cache_handle != nullptr) { + if (is_index) { + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + } + block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); @@ -1843,119 +1933,15 @@ BlockBasedTable::GetUncompressionDict(Rep* rep, // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, CachableEntry* index_entry, - GetContext* get_context) { - // index reader has already been pre-populated. - if (rep_->index_reader) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return rep_->index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - // we have a pinned index block - if (rep_->index_entry.IsCached()) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. 
- return rep_->index_entry.GetValue()->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - - PERF_TIMER_GUARD(read_index_block_nanos); - - const bool no_io = read_options.read_tier == kBlockCacheTier; - Cache* block_cache = rep_->table_options.block_cache.get(); - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = - GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_INDEX_MISS, - BLOCK_CACHE_INDEX_HIT, - get_context ? &get_context->get_context_stats_.num_cache_index_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_index_hit - : nullptr, - statistics, get_context); - - if (cache_handle == nullptr && no_io) { - if (input_iter != nullptr) { - input_iter->Invalidate(Status::Incomplete("no blocking io")); - return input_iter; - } else { - return NewErrorInternalIterator( - Status::Incomplete("no blocking io")); - } - } + IndexBlockIter* input_iter, GetContext* get_context) { - IndexReader* index_reader = nullptr; - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_index_hit_count, 1); - index_reader = - reinterpret_cast(block_cache->Value(cache_handle)); - } else { - // Create index reader and put it in the cache. - Status s; - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); - s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); - size_t charge = 0; - if (s.ok()) { - assert(index_reader != nullptr); - charge = index_reader->ApproximateMemoryUsage(); - s = block_cache->Insert( - key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW); - } - - if (s.ok()) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - PERF_COUNTER_ADD(index_block_read_count, 1); - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } else { - if (index_reader != nullptr) { - delete index_reader; - } - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - // make sure if something goes wrong, index_reader shall remain intact. - if (input_iter != nullptr) { - input_iter->Invalidate(s); - return input_iter; - } else { - return NewErrorInternalIterator(s); - } - } - } + assert(rep_ != nullptr); + assert(rep_->index_reader != nullptr); - assert(cache_handle); - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- auto* iter = index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek); - - // the caller would like to take ownership of the index block - // don't call RegisterCleanup() in this case, the caller will take care of it - if (index_entry != nullptr) { - *index_entry = {index_reader, block_cache, cache_handle, - false /* own_value */}; - } else { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); - } - - return iter; + return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, + input_iter, get_context); } // Convert an index iterator value (i.e., an encoded BlockHandle) @@ -1970,118 +1956,85 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); - Cache* block_cache = rep->table_options.block_cache.get(); + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + const bool no_io = (ro.read_tier == kBlockCacheTier); + auto uncompression_dict_storage = + GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); + const UncompressionDict& uncompression_dict = + uncompression_dict_storage.GetValue() == nullptr + ? UncompressionDict::GetEmptyDict() + : *uncompression_dict_storage.GetValue(); + CachableEntry block; - TBlockIter* iter; - { - const bool no_io = (ro.read_tier == kBlockCacheTier); - auto uncompression_dict_storage = - GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.GetValue() == nullptr - ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.GetValue(); - if (s.ok()) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, - uncompression_dict, &block, is_index, - get_context); - } + s = RetrieveBlock(prefetch_buffer, rep, ro, handle, uncompression_dict, + &block, is_index, get_context); - if (input_iter != nullptr) { - iter = input_iter; - } else { - iter = new TBlockIter; - } - // Didn't get any data from block caches. - if (s.ok() && block.GetValue() == nullptr) { - if (no_io) { - // Could not read from block_cache and can't do IO - iter->Invalidate(Status::Incomplete("no blocking io")); - return iter; - } - std::unique_ptr block_value; - { - StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, - READ_BLOCK_GET_MICROS); - s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &block_value, rep->ioptions, - rep->blocks_maybe_compressed /*do_decompress*/, - rep->blocks_maybe_compressed, uncompression_dict, - rep->persistent_cache_options, - is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit, - GetMemoryAllocator(rep->table_options)); - } + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + constexpr bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
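+ // For example, a block served from the block cache stays valid for as long + // as the iterator holds the corresponding cache handle; the cleanup that + // releases the handle is transferred onto the iterator via + // block.TransferTo(iter) below.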
+ const bool block_contents_pinned = block.IsCached() || + (!block.GetValue()->own_bytes() && rep->immortal_table); + iter = block.GetValue()->NewIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full, block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep->cache_key_prefix_size != 0); + assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); if (s.ok()) { - block.SetOwnedValue(block_value.release()); + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); } } - // TODO(ajkr): also pin compression dictionary block when - // `pin_l0_filter_and_index_blocks_in_cache == true`. } - if (s.ok()) { - assert(block.GetValue() != nullptr); - const bool kTotalOrderSeek = true; - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. - bool block_contents_pinned = - (block.IsCached() || - (!block.GetValue()->own_bytes() && rep->immortal_table)); - iter = block.GetValue()->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); - if (!block.IsCached()) { - if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache::Handle* cache_handle; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep->cache_key_prefix_size != 0); - assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - Slice unique_key = - Slice(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); - if (s.ok()) { - if (cache_handle != nullptr) { - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } - } - - block.TransferTo(iter); - } else { - assert(block.GetValue() == nullptr); - iter->Invalidate(s); - } + block.TransferTo(iter); return iter; } Status BlockBasedTable::MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, bool is_index, GetContext* get_context) { assert(block_entry != nullptr); @@ -2116,7 +2069,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, rep, ro, block_entry, uncompression_dict, - rep->table_options.read_amp_bytes_per_bit, + !is_index ? + rep->table_options.read_amp_bytes_per_bit : 0, is_index, get_context); // Can't find the block from the cache. If I/O is allowed, read from the @@ -2148,7 +2102,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, block_entry, &raw_block_contents, raw_block_comp_type, rep->table_options.format_version, uncompression_dict, seq_no, - rep->table_options.read_amp_bytes_per_bit, + !is_index ? 
rep->table_options.read_amp_bytes_per_bit : 0, GetMemoryAllocator(rep->table_options), is_index, is_index && rep->table_options .cache_index_and_filter_blocks_with_high_priority @@ -2162,6 +2116,64 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( return s; } +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, GetContext* get_context) { + + assert(rep); + assert(block_entry); + assert(block_entry->IsEmpty()); + + Status s; + if (!is_index || rep->table_options.cache_index_and_filter_blocks) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + uncompression_dict, block_entry, + is_index, get_context); + + if (!s.ok()) { + return s; + } + + if (block_entry->GetValue() != nullptr) { + assert (s.ok()); + return s; + } + } + + assert(block_entry->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + std::unique_ptr block; + + { + StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile(rep->file.get(), prefetch_buffer, rep->footer, ro, + handle, &block, rep->ioptions, + rep->blocks_maybe_compressed, + rep->blocks_maybe_compressed, uncompression_dict, + rep->persistent_cache_options, + rep->get_global_seqno(is_index), + !is_index ? + rep->table_options.read_amp_bytes_per_bit : 0, + GetMemoryAllocator(rep->table_options)); + } + + if (!s.ok()) { + return s; + } + + block_entry->SetOwnedValue(block.release()); + + assert(s.ok()); + return s; +} + BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, @@ -2188,7 +2200,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, block_cache->GetUsage(block->second.GetCacheHandle())); Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need + // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
return block->second.GetValue()->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), @@ -2747,7 +2759,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, get_context); + get_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2868,7 +2880,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } auto iiter = NewIndexIterator( read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, sst_file_range.begin()->get_context); + sst_file_range.begin()->get_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -3085,45 +3097,37 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( return s; } +bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { + assert(rep_ != nullptr); + + Cache* const cache = rep_->table_options.block_cache.get(); + if (cache == nullptr) { + return false; + } + + char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice cache_key = GetCacheKey(rep_->cache_key_prefix, + rep_->cache_key_prefix_size, handle, + cache_key_storage); + + Cache::Handle* const cache_handle = cache->Lookup(cache_key); + if (cache_handle == nullptr) { + return false; + } + + cache->Release(cache_handle); + + return true; +} + bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr> iiter( NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); - CachableEntry block; - - BlockHandle handle = iiter->value(); - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(block_cache != nullptr); - - char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); - Slice ckey; - Status s; - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, - &compression_dict_block); - if (s.ok()) { - assert(compression_dict_block != nullptr); - UncompressionDict uncompression_dict( - compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed); - s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, rep_, - options, &block, uncompression_dict, - 0 /* read_amp_bytes_per_bit */); - } - } else { - s = GetDataBlockFromCache( - cache_key, ckey, block_cache, nullptr, rep_, options, &block, - UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); - } - assert(s.ok()); - return block.IsCached(); + return TEST_BlockInCache(iiter->value()); } BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { @@ -3151,14 +3155,11 @@ BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { // 4. internal_comparator // 5. 
index_type Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter, int level) { + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, + bool pin, IndexReader** index_reader) { auto index_type_on_file = rep_->index_type; - auto file = rep_->file.get(); - const InternalKeyComparator* icomparator = &rep_->internal_comparator; - const Footer& footer = rep_->footer; - // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. @@ -3167,25 +3168,12 @@ Status BlockBasedTable::CreateIndexReader( switch (index_type_on_file) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionIndexReader::Create( - this, file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, level, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, index_reader); } case BlockBasedTableOptions::kBinarySearch: { - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -3200,29 +3188,15 @@ Status BlockBasedTable::CreateIndexReader( ROCKS_LOG_WARN(rep_->ioptions.info_log, "Unable to read the metaindex block." 
" Fall back to binary search index."); - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return BinarySearchIndexReader::Create(this, prefetch_buffer, + use_cache, prefetch, pin, + index_reader); } meta_index_iter = meta_iter_guard.get(); } - return HashIndexReader::Create( - rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, - rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, - index_reader, rep_->hash_index_allow_collision, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); + return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + use_cache, prefetch, pin, index_reader); } default: { std::string error_message = @@ -3261,8 +3235,10 @@ bool BlockBasedTable::TEST_filter_block_preloaded() const { return rep_->filter != nullptr; } -bool BlockBasedTable::TEST_index_reader_preloaded() const { - return rep_->index_reader != nullptr; +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); } Status BlockBasedTable::GetKVPairsFromDataBlocks( @@ -3479,12 +3455,6 @@ void BlockBasedTable::Close() { rep_->filter_handle, cache_key); cache->Erase(key); - // Get the index block key - key = GetCacheKeyFromOffset(rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - cache->Erase(key); - if (!rep_->compression_dict_handle.IsNull()) { // Get the compression dictionary block key key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, @@ -3674,15 +3644,6 @@ void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { delete filter; } -void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { - IndexReader* index_reader = reinterpret_cast(value); - if (index_reader->statistics() != nullptr) { - RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, - index_reader->ApproximateMemoryUsage()); - } - delete index_reader; -} - void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { UncompressionDict* dict = reinterpret_cast(value); RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 270409b3ab6..54ce34d617b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -150,6 +150,8 @@ class BlockBasedTable : public TableReader { // be close to the file length. uint64_t ApproximateOffsetOf(const Slice& key) override; + bool TEST_BlockInCache(const BlockHandle& handle) const; + // Returns true if the block for the specified key is in cache. 
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
@@ -173,54 +175,35 @@ class BlockBasedTable : public TableReader {
  ~BlockBasedTable();

  bool TEST_filter_block_preloaded() const;
-  bool TEST_index_reader_preloaded() const;
+  bool TEST_IndexBlockInCache() const;

-  // IndexReader is the interface that provide the functionality for index
+  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
-    explicit IndexReader(const InternalKeyComparator* icomparator,
-                         Statistics* stats)
-        : icomparator_(icomparator), statistics_(stats) {}
-
-    virtual ~IndexReader() {}
-
-    // Create an iterator for index access.
-    // If iter is null then a new object is created on heap and the callee will
-    // have the ownership. If a non-null iter is passed in it will be used, and
-    // the returned value is either the same as iter or a new on-heap object
-    // that
-    // wrapps the passed iter. In the latter case the return value would point
-    // to
-    // a different object then iter and the callee has the ownership of the
+    virtual ~IndexReader() = default;
+
+    // Create an iterator for index access. If iter is null, then a new object
+    // is created on the heap, and the callee will have the ownership.
+    // If a non-null iter is passed in, it will be used, and the returned value
+    // is either the same as iter or a new on-heap object that
+    // wraps the passed iter. In the latter case the return value points
+    // to a different object than iter, and the callee has the ownership of the
    // returned object.
    virtual InternalIteratorBase<BlockHandle>* NewIterator(
-        IndexBlockIter* iter = nullptr, bool total_order_seek = true,
-        bool fill_cache = true) = 0;
-
-    // The size of the index.
-    virtual size_t size() const = 0;
-    // Memory usage of the index block
-    virtual size_t usable_size() const = 0;
-    // return the statistics pointer
-    virtual Statistics* statistics() const { return statistics_; }
+        const ReadOptions& read_options, bool disable_prefix_seek,
+        IndexBlockIter* iter, GetContext* get_context) = 0;
+
    // Report an approximation of how much memory has been used other than
-    // memory
-    // that was allocated in block cache.
+    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
-
-    virtual void CacheDependencies(bool /* unused */) {}
-
-    // Prefetch all the blocks referenced by this index to the buffer
-    void PrefetchBlocks(FilePrefetchBuffer* buf);
-
-   protected:
-    const InternalKeyComparator* icomparator_;
-
-   private:
-    Statistics* statistics_;
+    // Cache the dependencies of the index reader (e.g. the partitions
+    // of a partitioned index).
+    virtual void CacheDependencies(bool /* pin */) {}
  };

+  class IndexReaderCommon;
+
  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);
@@ -271,11 +254,22 @@
  // in uncompressed block cache, also sets cache_handle to reference that
  // block.
  static Status MaybeReadBlockAndLoadToCache(
-      FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
      CachableEntry<Block>* block_entry, bool is_index = false,
      GetContext* get_context = nullptr);

+  // Similar to the above, with one crucial difference: it will retrieve the
+  // block from the file even if there are no caches configured (assuming the
+  // read options allow I/O).
+  static Status RetrieveBlock(
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
+      CachableEntry<Block>* block_entry, bool is_index,
+      GetContext* get_context);
+
  // For the following two functions:
  // if `no_io == true`, we will not try to read filter/index from sst file
  // were they not present in cache yet.
@@ -305,7 +299,6 @@ class BlockBasedTable : public TableReader {
  InternalIteratorBase<BlockHandle>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check = false,
      IndexBlockIter* input_iter = nullptr,
-      CachableEntry<IndexReader>* index_entry = nullptr,
      GetContext* get_context = nullptr);

  // Read block cache from block caches (if set): block_cache and
@@ -316,7 +309,7 @@
  // dictionary.
  static Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
-      Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
+      Cache* block_cache, Cache* block_cache_compressed, const Rep* rep,
      const ReadOptions& read_options, CachableEntry<Block>* block,
      const UncompressionDict& uncompression_dict,
      size_t read_amp_bytes_per_bit, bool is_index = false,
@@ -359,9 +352,9 @@
  // need to access extra meta blocks for index construction. This parameter
  // helps avoid re-reading meta index block if caller already created one.
  Status CreateIndexReader(
-      FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
-      InternalIterator* preloaded_meta_index_iter = nullptr,
-      const int level = -1);
+      FilePrefetchBuffer* prefetch_buffer,
+      InternalIterator* preloaded_meta_index_iter, bool use_cache,
+      bool prefetch, bool pin, IndexReader** index_reader);

  bool FullFilterKeyMayMatch(
      const ReadOptions& read_options, FilterBlockReader* filter,
@@ -398,9 +391,8 @@
  static Status PrefetchIndexAndFilterBlocks(
      Rep* rep, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
-      const SliceTransform* prefix_extractor, bool prefetch_all,
-      const BlockBasedTableOptions& table_options, const int level,
-      const bool prefetch_index_and_filter_in_cache);
+      bool prefetch_all, const BlockBasedTableOptions& table_options,
+      const int level);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);
@@ -411,7 +403,7 @@
      const bool is_a_filter_partition,
      const SliceTransform* prefix_extractor = nullptr) const;

-  static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
+  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file
  static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
@@ -486,18 +478,21 @@ struct BlockBasedTable::Rep {
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
-  uint64_t dummy_index_reader_offset =
-      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;
-  // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e.,
-  // non-nullptr) and used only when options.block_cache is nullptr or when
-  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index,
-  // filter, and compression dictionary blocks via the block cache. In that case
-  // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle`
-  // are used to lookup these meta-blocks in block cache.
+  // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr)
+  // and used only when options.block_cache is nullptr or when
+  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the
+  // filter and compression dictionary blocks via the block cache. In that
+  // case, `filter_handle` and `compression_dict_handle` are used to look up
+  // these meta-blocks in block cache.
+  //
+  // Note: the IndexReader object is always stored in this member variable;
+  // the index block itself, however, may or may not be in the block cache
+  // based on the settings above. We plan to change the handling of the
+  // filter and compression dictionary similarly.
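For concreteness, the modes this comment distinguishes are driven by a handful of BlockBasedTableOptions fields. Below is a minimal sketch using the public RocksDB API; the function name, cache size, and specific option choices are illustrative, not part of this patch:

    #include <memory>
    #include "rocksdb/cache.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Sketch: options for the "metadata in block cache" mode described above.
    // With cache_index_and_filter_blocks enabled, filter and compression
    // dictionary blocks are looked up in block_cache via their handles, and
    // the index block is cached as well, while the IndexReader object itself
    // keeps living in Rep::index_reader.
    rocksdb::Options MakeCachedMetadataOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = rocksdb::NewLRUCache(8 << 20);  // illustrative
      table_options.cache_index_and_filter_blocks = true;
      // Optionally keep L0 / top-level metadata pinned while cached:
      table_options.pin_l0_filter_and_index_blocks_in_cache = true;
      table_options.pin_top_level_index_and_filter = true;

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }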
  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDict> uncompression_dict;
@@ -526,12 +521,11 @@ struct BlockBasedTable::Rep {
  // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is
  // true or in all levels when pin_top_level_index_and_filter is set in
-  // combination with partitioned index/filters: then we do use the LRU cache,
-  // but we always keep the filter & index block's handle checked out here (=we
+  // combination with partitioned filters: then we do use the LRU cache,
+  // but we always keep the filter block's handle checked out here (=we
  // don't call Release()), plus the parsed out objects the LRU cache will never
  // push flush them out, hence they're pinned
  CachableEntry<FilterBlockReader> filter_entry;
-  CachableEntry<IndexReader> index_entry;
  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all Keys in this file will have the same
diff --git a/table/table_test.cc b/table/table_test.cc
index dccc4919409..aeb66f8d35f 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1993,7 +1993,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
  // preloading filter/index blocks is enabled.
  auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
  ASSERT_TRUE(reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(reader->TEST_index_reader_preloaded());
+  ASSERT_FALSE(reader->TEST_IndexBlockInCache());

  {
    // nothing happens in the beginning
@@ -2040,7 +2040,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
  // preloading filter/index blocks is prohibited.
  auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
  ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+  ASSERT_TRUE(reader->TEST_IndexBlockInCache());

  // -- PART 1: Open with regular block cache.
  // Since block_cache is disabled, no cache activities will be involved.
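The flipped assertions above capture the new invariant end to end: the IndexReader now always lives in the table object, and the index block itself appears in the block cache only when cache_index_and_filter_blocks is enabled. A runnable sketch of observing this through the public API follows; the database path and cache size are illustrative, error handling is elided, and note the index block may already be prefetched into the cache at table-open time, hence the non-strict comparison:

    #include <cassert>
    #include <memory>
    #include <string>
    #include "rocksdb/cache.h"
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1 << 20);

      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = cache;
      table_options.cache_index_and_filter_blocks = true;

      rocksdb::Options options;
      options.create_if_missing = true;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/index_cache_demo", &db);
      assert(s.ok());

      db->Put(rocksdb::WriteOptions(), "key", "value");
      db->Flush(rocksdb::FlushOptions());  // write an SST with an index block

      const size_t usage_before = cache->GetUsage();
      std::string value;
      db->Get(rocksdb::ReadOptions(), "key", &value);  // index served via cache

      // With cache_index_and_filter_blocks == true, the index block is charged
      // to the block cache rather than held outside it.
      assert(cache->GetUsage() >= usage_before);

      delete db;
      return 0;
    }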
@@ -2612,69 +2612,6 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); } -TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { - // A regression test to avoid data race described in - // https://github.com/facebook/rocksdb/issues/1267 - TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); - std::vector keys; - stl_wrappers::KVMap kvmap; - c.Add("a1", "val1"); - Options options; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.index_type = BlockBasedTableOptions::kHashSearch; - table_options.cache_index_and_filter_blocks = true; - table_options.block_cache = NewLRUCache(0); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - c.Finish(options, ioptions, moptions, table_options, - GetPlainInternalComparator(options.comparator), &keys, &kvmap); - - rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( - { - {"BlockBasedTable::NewIndexIterator::thread1:1", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTable::NewIndexIterator::thread2:3", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - }, - { - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:1"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:3"}, - }); - - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions ro; - auto* reader = c.GetTableReader(); - - std::function func1 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); - // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); - }; - - std::function func2 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - }; - - auto thread1 = port::Thread(func1); - auto thread2 = port::Thread(func2); - thread1.join(); - thread2.join(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - c.ResetTableReader(); -} - // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(PlainTableTest, BasicPlainTableProperties) { From e62986260f12abad62d84182d106daeb147168e7 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 30 May 2019 14:01:44 -0700 Subject: [PATCH 084/572] Fix env_options_for_read spelling in CompactionJob Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5380 Differential Revision: D15563386 Pulled By: sagar0 fbshipit-source-id: 8b26aef47cfc40ff8016daf815582f21cdd40df2 --- db/compaction_job.cc | 4 ++-- db/compaction_job.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 91c7f437a17..9e5d46f877d 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -324,7 +324,7 @@ CompactionJob::CompactionJob( db_options_(db_options), env_options_(env_options), env_(db_options.env), - 
env_optiosn_for_read_( + env_options_for_read_( env_->OptimizeForCompactionTableRead(env_options, db_options_)), versions_(versions), shutting_down_(shutting_down), @@ -836,7 +836,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg, env_optiosn_for_read_)); + sub_compact->compaction, &range_del_agg, env_options_for_read_)); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); diff --git a/db/compaction_job.h b/db/compaction_job.h index a37c54de809..0751727d704 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -151,7 +151,7 @@ class CompactionJob { Env* env_; // env_option optimized for compaction table reads - EnvOptions env_optiosn_for_read_; + EnvOptions env_options_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; const SequenceNumber preserve_deletes_seqnum_; From 50e470791dafb3db017f055f79323aef9a607e43 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Thu, 30 May 2019 14:47:29 -0700 Subject: [PATCH 085/572] Organizing rocksdb/table directory by format Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5373 Differential Revision: D15559425 Pulled By: vjnadimpalli fbshipit-source-id: 5d6d6d615582bedd96a4b879bb25d429a6de8b55 --- CMakeLists.txt | 57 ++++++++--------- Makefile | 14 ++--- TARGETS | 58 ++++++++--------- db/builder.cc | 2 +- db/column_family.cc | 2 +- db/compaction_job.cc | 4 +- db/compaction_job_stats_test.cc | 4 +- db/corruption_test.cc | 3 +- db/cuckoo_table_db_test.cc | 4 +- db/db_impl.cc | 4 +- db/db_impl_open.cc | 2 +- db/db_iterator_test.cc | 2 +- db/db_test.cc | 4 +- db/db_test_util.h | 5 +- db/flush_job.cc | 4 +- db/internal_stats.cc | 2 +- db/listener_test.cc | 5 +- db/plain_table_db_test.cc | 6 +- db/table_properties_collector_test.cc | 7 ++- db/version_set.cc | 2 +- options/options.cc | 2 +- options/options_helper.cc | 5 +- options/options_parser.h | 2 +- src.mk | 62 +++++++++---------- .../{ => adaptive}/adaptive_table_factory.cc | 2 +- table/{ => adaptive}/adaptive_table_factory.h | 0 table/{ => block_based}/block.cc | 6 +- table/{ => block_based}/block.h | 6 +- .../block_based_filter_block.cc | 2 +- .../block_based_filter_block.h | 3 +- .../block_based_filter_block_test.cc | 2 +- .../block_based_table_builder.cc | 20 +++--- .../block_based_table_builder.h | 2 +- .../block_based_table_factory.cc | 6 +- .../block_based_table_factory.h | 0 .../block_based_table_reader.cc | 21 +++---- .../block_based_table_reader.h | 8 +-- table/{ => block_based}/block_builder.cc | 4 +- table/{ => block_based}/block_builder.h | 2 +- table/{ => block_based}/block_prefix_index.cc | 2 +- table/{ => block_based}/block_prefix_index.h | 0 table/{ => block_based}/block_test.cc | 5 +- table/{ => block_based}/cachable_entry.h | 0 table/{ => block_based}/data_block_footer.cc | 2 +- table/{ => block_based}/data_block_footer.h | 0 .../data_block_hash_index.cc | 2 +- .../{ => block_based}/data_block_hash_index.h | 0 .../data_block_hash_index_test.cc | 9 +-- table/{ => block_based}/filter_block.h | 2 +- table/{ => block_based}/flush_block_policy.cc | 2 +- table/{ => block_based}/flush_block_policy.h | 0 table/{ => block_based}/full_filter_block.cc | 2 +- table/{ => block_based}/full_filter_block.h | 3 +- 
.../full_filter_block_test.cc | 2 +- table/{ => block_based}/index_builder.cc | 5 +- table/{ => block_based}/index_builder.h | 4 +- .../partitioned_filter_block.cc | 6 +- .../partitioned_filter_block.h | 11 ++-- .../partitioned_filter_block_test.cc | 8 ++- table/block_fetcher.cc | 4 +- table/block_fetcher.h | 2 +- table/{ => cuckoo}/cuckoo_table_builder.cc | 6 +- table/{ => cuckoo}/cuckoo_table_builder.h | 0 .../{ => cuckoo}/cuckoo_table_builder_test.cc | 2 +- table/{ => cuckoo}/cuckoo_table_factory.cc | 6 +- table/{ => cuckoo}/cuckoo_table_factory.h | 0 table/{ => cuckoo}/cuckoo_table_reader.cc | 4 +- table/{ => cuckoo}/cuckoo_table_reader.h | 0 .../{ => cuckoo}/cuckoo_table_reader_test.cc | 6 +- table/format.cc | 6 +- table/get_context.h | 2 +- table/meta_blocks.cc | 4 +- table/meta_blocks.h | 2 +- table/persistent_cache_helper.cc | 2 +- table/{ => plain}/plain_table_builder.cc | 8 +-- table/{ => plain}/plain_table_builder.h | 5 +- table/{ => plain}/plain_table_factory.cc | 6 +- table/{ => plain}/plain_table_factory.h | 0 table/{ => plain}/plain_table_index.cc | 2 +- table/{ => plain}/plain_table_index.h | 0 table/{ => plain}/plain_table_key_coding.cc | 6 +- table/{ => plain}/plain_table_key_coding.h | 3 +- table/{ => plain}/plain_table_reader.cc | 10 +-- table/{ => plain}/plain_table_reader.h | 4 +- table/sst_file_writer.cc | 3 +- table/table_properties.cc | 3 +- table/table_reader_bench.cc | 4 +- table/table_test.cc | 18 +++--- table/two_level_iterator.cc | 2 +- test_util/testutil.h | 4 +- tools/sst_dump_test.cc | 2 +- tools/sst_dump_tool.cc | 11 ++-- tools/trace_analyzer_tool.cc | 2 +- util/bloom.cc | 4 +- utilities/blob_db/blob_db_impl.cc | 7 ++- utilities/memory/memory_test.cc | 2 +- .../persistent_cache_bench.cc | 2 +- .../persistent_cache/persistent_cache_test.h | 2 +- 98 files changed, 292 insertions(+), 275 deletions(-) rename table/{ => adaptive}/adaptive_table_factory.cc (98%) rename table/{ => adaptive}/adaptive_table_factory.h (100%) rename table/{ => block_based}/block.cc (99%) rename table/{ => block_based}/block.h (99%) rename table/{ => block_based}/block_based_filter_block.cc (99%) rename table/{ => block_based}/block_based_filter_block.h (99%) rename table/{ => block_based}/block_based_filter_block_test.cc (99%) rename table/{ => block_based}/block_based_table_builder.cc (98%) rename table/{ => block_based}/block_based_table_builder.h (100%) rename table/{ => block_based}/block_based_table_factory.cc (99%) rename table/{ => block_based}/block_based_table_factory.h (100%) rename table/{ => block_based}/block_based_table_reader.cc (99%) rename table/{ => block_based}/block_based_table_reader.h (99%) rename table/{ => block_based}/block_builder.cc (98%) rename table/{ => block_based}/block_builder.h (98%) rename table/{ => block_based}/block_prefix_index.cc (99%) rename table/{ => block_based}/block_prefix_index.h (100%) rename table/{ => block_based}/block_test.cc (99%) rename table/{ => block_based}/cachable_entry.h (100%) rename table/{ => block_based}/data_block_footer.cc (97%) rename table/{ => block_based}/data_block_footer.h (100%) rename table/{ => block_based}/data_block_hash_index.cc (98%) rename table/{ => block_based}/data_block_hash_index.h (100%) rename table/{ => block_based}/data_block_hash_index_test.cc (99%) rename table/{ => block_based}/filter_block.h (99%) rename table/{ => block_based}/flush_block_policy.cc (98%) rename table/{ => block_based}/flush_block_policy.h (100%) rename table/{ => block_based}/full_filter_block.cc (99%) rename table/{ => 
block_based}/full_filter_block.h (99%) rename table/{ => block_based}/full_filter_block_test.cc (99%) rename table/{ => block_based}/index_builder.cc (98%) rename table/{ => block_based}/index_builder.h (99%) rename table/{ => block_based}/partitioned_filter_block.cc (98%) rename table/{ => block_based}/partitioned_filter_block.h (95%) rename table/{ => block_based}/partitioned_filter_block_test.cc (99%) rename table/{ => cuckoo}/cuckoo_table_builder.cc (99%) rename table/{ => cuckoo}/cuckoo_table_builder.h (100%) rename table/{ => cuckoo}/cuckoo_table_builder_test.cc (99%) rename table/{ => cuckoo}/cuckoo_table_factory.cc (94%) rename table/{ => cuckoo}/cuckoo_table_factory.h (100%) rename table/{ => cuckoo}/cuckoo_table_reader.cc (99%) rename table/{ => cuckoo}/cuckoo_table_reader.h (100%) rename table/{ => cuckoo}/cuckoo_table_reader_test.cc (99%) rename table/{ => plain}/plain_table_builder.cc (98%) rename table/{ => plain}/plain_table_builder.h (98%) rename table/{ => plain}/plain_table_factory.cc (98%) rename table/{ => plain}/plain_table_factory.h (100%) rename table/{ => plain}/plain_table_index.cc (99%) rename table/{ => plain}/plain_table_index.h (100%) rename table/{ => plain}/plain_table_key_coding.cc (99%) rename table/{ => plain}/plain_table_key_coding.h (99%) rename table/{ => plain}/plain_table_reader.cc (99%) rename table/{ => plain}/plain_table_reader.h (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6449047fca6..5614c83b44a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -566,36 +566,36 @@ set(SOURCES options/options_parser.cc options/options_sanity_check.cc port/stack_trace.cc - table/adaptive_table_factory.cc - table/block.cc - table/block_based_filter_block.cc - table/block_based_table_builder.cc - table/block_based_table_factory.cc - table/block_based_table_reader.cc - table/block_builder.cc + table/adaptive/adaptive_table_factory.cc + table/block_based/block.cc + table/block_based/block_based_filter_block.cc + table/block_based/block_based_table_builder.cc + table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_reader.cc + table/block_based/block_builder.cc + table/block_based/block_prefix_index.cc + table/block_based/data_block_hash_index.cc + table/block_based/data_block_footer.cc + table/block_based/flush_block_policy.cc + table/block_based/full_filter_block.cc + table/block_based/index_builder.cc + table/block_based/partitioned_filter_block.cc table/block_fetcher.cc - table/block_prefix_index.cc table/bloom_block.cc - table/cuckoo_table_builder.cc - table/cuckoo_table_factory.cc - table/cuckoo_table_reader.cc - table/data_block_hash_index.cc - table/data_block_footer.cc - table/flush_block_policy.cc + table/cuckoo/cuckoo_table_builder.cc + table/cuckoo/cuckoo_table_factory.cc + table/cuckoo/cuckoo_table_reader.cc table/format.cc - table/full_filter_block.cc table/get_context.cc - table/index_builder.cc table/iterator.cc table/merging_iterator.cc table/meta_blocks.cc - table/partitioned_filter_block.cc table/persistent_cache_helper.cc - table/plain_table_builder.cc - table/plain_table_factory.cc - table/plain_table_index.cc - table/plain_table_key_coding.cc - table/plain_table_reader.cc + table/plain/plain_table_builder.cc + table/plain/plain_table_factory.cc + table/plain/plain_table_index.cc + table/plain/plain_table_key_coding.cc + table/plain/plain_table_reader.cc table/sst_file_reader.cc table/sst_file_writer.cc table/table_properties.cc @@ -940,13 +940,14 @@ if(WITH_TESTS) monitoring/statistics_test.cc 
options/options_settable_test.cc options/options_test.cc - table/block_based_filter_block_test.cc - table/block_test.cc + table/block_based/block_based_filter_block_test.cc + table/block_based/block_test.cc + table/block_based/data_block_hash_index_test.cc + table/block_based/full_filter_block_test.cc + table/block_based/partitioned_filter_block_test.cc table/cleanable_test.cc - table/cuckoo_table_builder_test.cc - table/cuckoo_table_reader_test.cc - table/data_block_hash_index_test.cc - table/full_filter_block_test.cc + table/cuckoo/cuckoo_table_builder_test.cc + table/cuckoo/cuckoo_table_reader_test.cc table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc diff --git a/Makefile b/Makefile index 16d5da0b16c..d41192ab2e0 100644 --- a/Makefile +++ b/Makefile @@ -1378,13 +1378,13 @@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_filter_block_test: table/block_based/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +full_filter_block_test: table/block_based/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -partitioned_filter_block_test: table/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +partitioned_filter_block_test: table/block_based/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1396,10 +1396,10 @@ cleanable_test: table/cleanable_test.o $(LIBOBJECTS) $(TESTHARNESS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_test: table/block_based/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +data_block_hash_index_test: table/block_based/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1465,10 +1465,10 @@ rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS) $(AM_LINK) -cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_builder_test: table/cuckoo/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_reader_test: table/cuckoo/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index c438aa3fb45..70d6e219413 100644 --- a/TARGETS +++ b/TARGETS @@ -173,36 +173,36 @@ cpp_library( "options/options_sanity_check.cc", "port/port_posix.cc", "port/stack_trace.cc", - "table/adaptive_table_factory.cc", - "table/block.cc", - "table/block_based_filter_block.cc", - "table/block_based_table_builder.cc", - "table/block_based_table_factory.cc", - "table/block_based_table_reader.cc", - "table/block_builder.cc", + "table/adaptive/adaptive_table_factory.cc", + "table/block_based/block.cc", + "table/block_based/block_based_filter_block.cc", + "table/block_based/block_based_table_builder.cc", + 
"table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_reader.cc", + "table/block_based/block_builder.cc", + "table/block_based/block_prefix_index.cc", + "table/block_based/data_block_hash_index.cc", + "table/block_based/data_block_footer.cc", + "table/block_based/flush_block_policy.cc", + "table/block_based/full_filter_block.cc", + "table/block_based/index_builder.cc", + "table/block_based/partitioned_filter_block.cc", "table/block_fetcher.cc", - "table/block_prefix_index.cc", "table/bloom_block.cc", - "table/cuckoo_table_builder.cc", - "table/cuckoo_table_factory.cc", - "table/cuckoo_table_reader.cc", - "table/data_block_footer.cc", - "table/data_block_hash_index.cc", - "table/flush_block_policy.cc", + "table/cuckoo/cuckoo_table_builder.cc", + "table/cuckoo/cuckoo_table_factory.cc", + "table/cuckoo/cuckoo_table_reader.cc", "table/format.cc", - "table/full_filter_block.cc", "table/get_context.cc", - "table/index_builder.cc", "table/iterator.cc", "table/merging_iterator.cc", "table/meta_blocks.cc", - "table/partitioned_filter_block.cc", "table/persistent_cache_helper.cc", - "table/plain_table_builder.cc", - "table/plain_table_factory.cc", - "table/plain_table_index.cc", - "table/plain_table_key_coding.cc", - "table/plain_table_reader.cc", + "table/plain/plain_table_builder.cc", + "table/plain/plain_table_factory.cc", + "table/plain/plain_table_index.cc", + "table/plain/plain_table_key_coding.cc", + "table/plain/plain_table_reader.cc", "table/sst_file_reader.cc", "table/sst_file_writer.cc", "table/table_properties.cc", @@ -378,12 +378,12 @@ ROCKS_TESTS = [ ], [ "block_based_filter_block_test", - "table/block_based_filter_block_test.cc", + "table/block_based/block_based_filter_block_test.cc", "serial", ], [ "block_test", - "table/block_test.cc", + "table/block_based/block_test.cc", "serial", ], [ @@ -488,7 +488,7 @@ ROCKS_TESTS = [ ], [ "cuckoo_table_builder_test", - "table/cuckoo_table_builder_test.cc", + "table/cuckoo/cuckoo_table_builder_test.cc", "serial", ], [ @@ -498,12 +498,12 @@ ROCKS_TESTS = [ ], [ "cuckoo_table_reader_test", - "table/cuckoo_table_reader_test.cc", + "table/cuckoo/cuckoo_table_reader_test.cc", "serial", ], [ "data_block_hash_index_test", - "table/data_block_hash_index_test.cc", + "table/block_based/data_block_hash_index_test.cc", "serial", ], [ @@ -743,7 +743,7 @@ ROCKS_TESTS = [ ], [ "full_filter_block_test", - "table/full_filter_block_test.cc", + "table/block_based/full_filter_block_test.cc", "serial", ], [ @@ -873,7 +873,7 @@ ROCKS_TESTS = [ ], [ "partitioned_filter_block_test", - "table/partitioned_filter_block_test.cc", + "table/block_based/partitioned_filter_block_test.cc", "serial", ], [ diff --git a/db/builder.cc b/db/builder.cc index 2b97ce1d608..14160f64c75 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -29,7 +29,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" #include "util/file_reader_writer.h" diff --git a/db/column_family.cc b/db/column_family.cc index 325610b8844..84f521cd7b8 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -33,7 +33,7 @@ #include "memtable/hash_skiplist_rep.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" 
#include "util/autovector.h" #include "util/compression.h" diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 9e5d46f877d..9e22e161f28 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -49,8 +49,8 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "util/coding.h" diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index 91441f5d76a..daf41386690 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -48,9 +48,9 @@ #include "rocksdb/thread_status.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/hash.h" diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 379c33e4599..130821ff997 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -25,8 +25,9 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" +#include "file/filename.h" #include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index ecd6d71ca2e..f9efbc58503 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -8,8 +8,8 @@ #include "db/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/meta_blocks.h" #include "util/string_util.h" #include "test_util/testharness.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index 749bd3629a0..ec162bb961e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -77,8 +77,8 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/merging_iterator.h" #include "table/multiget_context.h" diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 0be85031ba3..db47d141655 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -18,7 +18,7 @@ #include "file/sst_file_manager_impl.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/rate_limiter.h" #include "test_util/sync_point.h" diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index cc1af2e0ad8..e2b9f503ffb 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -15,7 +15,7 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "table/flush_block_policy.h" +#include 
"table/block_based/flush_block_policy.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 66df2323de2..bf0babd1a3a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -54,9 +54,9 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/file_reader_writer.h" diff --git a/db/db_test_util.h b/db/db_test_util.h index 3bc107889b4..1882cde59dc 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once + #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -41,9 +42,9 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "test_util/mock_time_env.h" diff --git a/db/flush_job.cc b/db/flush_job.cc index 4930ecac7e9..c8729c66840 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -40,8 +40,8 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 57c7427e801..58332f30faf 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -23,7 +23,7 @@ #include "db/column_family.h" #include "db/db_impl.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/listener_test.cc b/db/listener_test.cc index 663116b7b8d..881534a1f1d 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -22,8 +22,9 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" +#include "file/filename.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 7648ed85ff7..bfeb54243d9 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -25,9 +25,9 @@ #include "rocksdb/table.h" #include "table/bloom_block.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/plain/plain_table_reader.h" #include "table/table_builder.h" #include "util/hash.h" #include "util/logging.h" diff --git a/db/table_properties_collector_test.cc 
b/db/table_properties_collector_test.cc index 6171b2938c2..0705cc032fe 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -12,11 +12,12 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "db/table_properties_collector.h" + +#include "table/meta_blocks.h" #include "options/cf_options.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" #include "util/coding.h" #include "util/file_reader_writer.h" diff --git a/db/version_set.cc b/db/version_set.cc index b9616f3730b..864fc975358 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -45,7 +45,7 @@ #include "table/merging_iterator.h" #include "table/meta_blocks.h" #include "table/multiget_context.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" #include "util/coding.h" diff --git a/options/options.cc b/options/options.cc index 057727e59fb..a5037ee78d3 100644 --- a/options/options.cc +++ b/options/options.cc @@ -31,7 +31,7 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/compression.h" namespace rocksdb { diff --git a/options/options_helper.cc b/options/options_helper.cc index 82e7a1fa13a..0b531a6ec5e 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -9,6 +9,8 @@ #include #include #include + +#include "table/plain/plain_table_factory.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -20,8 +22,7 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/object_registry.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/cast_util.h" #include "util/string_util.h" diff --git a/options/options_parser.h b/options/options_parser.h index 5aab3e7e9b6..b2a806f179f 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -12,7 +12,7 @@ #include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" namespace rocksdb { diff --git a/src.mk b/src.mk index 100b3355e74..a0f4043bf76 100644 --- a/src.mk +++ b/src.mk @@ -97,36 +97,36 @@ LIB_SOURCES = \ options/options_sanity_check.cc \ port/port_posix.cc \ port/stack_trace.cc \ - table/adaptive_table_factory.cc \ - table/block.cc \ - table/block_based_filter_block.cc \ - table/block_based_table_builder.cc \ - table/block_based_table_factory.cc \ - table/block_based_table_reader.cc \ - table/block_builder.cc \ - table/block_fetcher.cc \ - table/block_prefix_index.cc \ - table/bloom_block.cc \ - table/cuckoo_table_builder.cc \ - table/cuckoo_table_factory.cc \ - table/cuckoo_table_reader.cc \ - table/data_block_hash_index.cc \ - table/data_block_footer.cc \ - table/flush_block_policy.cc \ + table/adaptive/adaptive_table_factory.cc \ + table/block_based/block.cc \ + table/block_based/block_based_filter_block.cc \ + table/block_based/block_based_table_builder.cc \ + 
table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_reader.cc \ + table/block_based/block_builder.cc \ + table/block_based/block_prefix_index.cc \ + table/block_based/data_block_hash_index.cc \ + table/block_based/data_block_footer.cc \ + table/block_based/flush_block_policy.cc \ + table/block_based/full_filter_block.cc \ + table/block_based/index_builder.cc \ + table/block_based/partitioned_filter_block.cc \ + table/block_fetcher.cc \ + table/bloom_block.cc \ + table/cuckoo/cuckoo_table_builder.cc \ + table/cuckoo/cuckoo_table_factory.cc \ + table/cuckoo/cuckoo_table_reader.cc \ table/format.cc \ - table/full_filter_block.cc \ table/get_context.cc \ - table/index_builder.cc \ table/iterator.cc \ table/merging_iterator.cc \ table/meta_blocks.cc \ - table/partitioned_filter_block.cc \ table/persistent_cache_helper.cc \ - table/plain_table_builder.cc \ - table/plain_table_factory.cc \ - table/plain_table_index.cc \ - table/plain_table_key_coding.cc \ - table/plain_table_reader.cc \ + table/plain/plain_table_builder.cc \ + table/plain/plain_table_factory.cc \ + table/plain/plain_table_index.cc \ + table/plain/plain_table_key_coding.cc \ + table/plain/plain_table_reader.cc \ table/sst_file_reader.cc \ table/sst_file_writer.cc \ table/table_properties.cc \ @@ -319,7 +319,6 @@ MAIN_SOURCES = \ db/obsolete_files_test.cc \ db/options_settable_test.cc \ db/options_file_test.cc \ - db/partitioned_filter_block_test.cc \ db/perf_context_test.cc \ db/persistent_cache_test.cc \ db/plain_table_db_test.cc \ @@ -348,13 +347,14 @@ MAIN_SOURCES = \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ options/options_test.cc \ - table/block_based_filter_block_test.cc \ - table/block_test.cc \ + table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_test.cc \ + table/block_based/data_block_hash_index_test.cc \ + table/block_based/full_filter_block_test.cc \ + table/block_based/partitioned_filter_block_test.cc \ table/cleanable_test.cc \ - table/cuckoo_table_builder_test.cc \ - table/cuckoo_table_reader_test.cc \ - table/data_block_hash_index_test.cc \ - table/full_filter_block_test.cc \ + table/cuckoo/cuckoo_table_builder_test.cc \ + table/cuckoo/cuckoo_table_reader_test.cc \ table/merger_test.cc \ table/sst_file_reader_test.cc \ table/table_reader_bench.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc similarity index 98% rename from table/adaptive_table_factory.cc rename to table/adaptive/adaptive_table_factory.cc index d5dcbc5f585..0086368a9bb 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef ROCKSDB_LITE -#include "table/adaptive_table_factory.h" +#include "table/adaptive/adaptive_table_factory.h" #include "table/table_builder.h" #include "table/format.h" diff --git a/table/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h similarity index 100% rename from table/adaptive_table_factory.h rename to table/adaptive/adaptive_table_factory.h diff --git a/table/block.cc b/table/block_based/block.cc similarity index 99% rename from table/block.cc rename to table/block_based/block.cc index a6cc8d2705f..dfc4aa3c679 100644 --- a/table/block.cc +++ b/table/block_based/block.cc @@ -9,7 +9,7 @@ // // Decodes the blocks generated by block_builder.cc. 
-#include "table/block.h" +#include "table/block_based/block.h" #include #include #include @@ -19,8 +19,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "table/block_prefix_index.h" -#include "table/data_block_footer.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" diff --git a/table/block.h b/table/block_based/block.h similarity index 99% rename from table/block.h rename to table/block_based/block.h index 869d2f1f286..8bf6f535612 100644 --- a/table/block.h +++ b/table/block_based/block.h @@ -22,13 +22,13 @@ #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "format.h" +#include "table/format.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" -#include "table/block_prefix_index.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" #include "table/internal_iterator.h" #include "util/random.h" #include "test_util/sync_point.h" diff --git a/table/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc similarity index 99% rename from table/block_based_filter_block.cc rename to table/block_based/block_based_filter_block.cc index 81087b243b7..fb366b5d316 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include #include "db/dbformat.h" diff --git a/table/block_based_filter_block.h b/table/block_based/block_based_filter_block.h similarity index 99% rename from table/block_based_filter_block.h rename to table/block_based/block_based_filter_block.h index d1ff585462a..74a2285e1e9 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -18,10 +18,11 @@ #include #include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block.h" #include "util/hash.h" namespace rocksdb { diff --git a/table/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc similarity index 99% rename from table/block_based_filter_block_test.cc rename to table/block_based/block_based_filter_block_test.cc index 2cb3abc27a6..8d074275ce6 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include "rocksdb/filter_policy.h" #include "util/coding.h" diff --git a/table/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc similarity index 98% rename from table/block_based_table_builder.cc rename to table/block_based/block_based_table_builder.cc index 9a1742e5f3a..034c6b238fd 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include #include @@ -20,6 +20,7 @@ #include #include "db/dbformat.h" +#include "index_builder.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -29,14 +30,15 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/full_filter_block.h" #include "table/table_builder.h" #include "util/coding.h" @@ -47,8 +49,6 @@ #include "util/string_util.h" #include "util/xxhash.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" namespace rocksdb { diff --git a/table/block_based_table_builder.h b/table/block_based/block_based_table_builder.h similarity index 100% rename from table/block_based_table_builder.h rename to table/block_based/block_based_table_builder.h index a1ef3889112..0c580b445dd 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -14,11 +14,11 @@ #include #include +#include "table/meta_blocks.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" -#include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/compression.h" diff --git a/table/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc similarity index 99% rename from table/block_based_table_factory.cc rename to table/block_based/block_based_table_factory.cc index 790a2c99ecc..609679394ea 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_table_factory.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -24,8 +23,9 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/flush_block_policy.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/table/block_based_table_factory.h b/table/block_based/block_based_table_factory.h similarity index 100% rename from table/block_based_table_factory.h rename to table/block_based/block_based_table_factory.h diff --git a/table/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc similarity index 99% rename from table/block_based_table_reader.cc rename to table/block_based/block_based_table_reader.cc index 82f96492662..725ecdb4e3f 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader.h" #include #include @@ -15,6 +15,8 @@ #include #include +#include "table/block_fetcher.h" +#include "table/meta_blocks.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" @@ -27,20 +29,17 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" - -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_fetcher.h" -#include "table/block_prefix_index.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/full_filter_block.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/meta_blocks.h" #include "table/multiget_context.h" -#include "table/partitioned_filter_block.h" #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" diff --git a/table/block_based_table_reader.h b/table/block_based/block_based_table_reader.h similarity index 99% rename from table/block_based_table_reader.h rename to table/block_based/block_based_table_reader.h index 54ce34d617b..6d265ba755b 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -23,10 +23,10 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" -#include "table/cachable_entry.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" #include "table/format.h" #include 
"table/get_context.h" #include "table/multiget_context.h" diff --git a/table/block_builder.cc b/table/block_based/block_builder.cc similarity index 98% rename from table/block_builder.cc rename to table/block_based/block_builder.cc index c14b4f6d3ee..a6a240c8e0a 100644 --- a/table/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -31,13 +31,13 @@ // num_restarts: uint32 // restarts[i] contains the offset within the block of the ith restart point. -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include #include #include "db/dbformat.h" #include "rocksdb/comparator.h" -#include "table/data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/block_builder.h b/table/block_based/block_builder.h similarity index 98% rename from table/block_builder.h rename to table/block_based/block_builder.h index 0576279f501..153e57569a2 100644 --- a/table/block_builder.h +++ b/table/block_based/block_builder.h @@ -13,7 +13,7 @@ #include #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" namespace rocksdb { diff --git a/table/block_prefix_index.cc b/table/block_based/block_prefix_index.cc similarity index 99% rename from table/block_prefix_index.cc rename to table/block_based/block_prefix_index.cc index 67c749d4c3a..0050f1f1e58 100644 --- a/table/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/block_prefix_index.h" +#include "table/block_based/block_prefix_index.h" #include diff --git a/table/block_prefix_index.h b/table/block_based/block_prefix_index.h similarity index 100% rename from table/block_prefix_index.h rename to table/block_based/block_prefix_index.h diff --git a/table/block_test.cc b/table/block_based/block_test.cc similarity index 99% rename from table/block_test.cc rename to table/block_based/block_test.cc index d359b4e59ca..a4c5678881e 100644 --- a/table/block_test.cc +++ b/table/block_based/block_test.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // + #include #include #include @@ -19,8 +20,8 @@ #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "util/random.h" #include "test_util/testharness.h" diff --git a/table/cachable_entry.h b/table/block_based/cachable_entry.h similarity index 100% rename from table/cachable_entry.h rename to table/block_based/cachable_entry.h diff --git a/table/data_block_footer.cc b/table/block_based/data_block_footer.cc similarity index 97% rename from table/data_block_footer.cc rename to table/block_based/data_block_footer.cc index cb9e1438152..2cf31b4c5ef 100644 --- a/table/data_block_footer.cc +++ b/table/block_based/data_block_footer.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "rocksdb/table.h" diff --git a/table/data_block_footer.h b/table/block_based/data_block_footer.h similarity index 100% rename from table/data_block_footer.h rename to table/block_based/data_block_footer.h diff --git a/table/data_block_hash_index.cc b/table/block_based/data_block_hash_index.cc similarity index 98% rename from table/data_block_hash_index.cc rename to table/block_based/data_block_hash_index.cc index adb1d7b8c26..7737a9491ee 100644 --- a/table/data_block_hash_index.cc +++ b/table/block_based/data_block_hash_index.cc @@ -6,7 +6,7 @@ #include #include "rocksdb/slice.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/data_block_hash_index.h b/table/block_based/data_block_hash_index.h similarity index 100% rename from table/data_block_hash_index.h rename to table/block_based/data_block_hash_index.h diff --git a/table/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc similarity index 99% rename from table/data_block_hash_index_test.cc rename to table/block_based/data_block_hash_index_test.cc index 0511b257aa3..204e92ecbe3 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -3,16 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). + #include #include #include #include "db/table_properties_collector.h" #include "rocksdb/slice.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" #include "test_util/testharness.h" diff --git a/table/filter_block.h b/table/block_based/filter_block.h similarity index 99% rename from table/filter_block.h rename to table/block_based/filter_block.h index 8abb88e5f4f..8b01214c7eb 100644 --- a/table/filter_block.h +++ b/table/block_based/filter_block.h @@ -24,7 +24,7 @@ #include #include #include "db/dbformat.h" -#include "format.h" +#include "table/format.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" diff --git a/table/flush_block_policy.cc b/table/block_based/flush_block_policy.cc similarity index 98% rename from table/flush_block_policy.cc rename to table/block_based/flush_block_policy.cc index 1b1675828da..31576848c07 100644 --- a/table/flush_block_policy.cc +++ b/table/block_based/flush_block_policy.cc @@ -6,7 +6,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include diff --git a/table/flush_block_policy.h b/table/block_based/flush_block_policy.h similarity index 100% rename from table/flush_block_policy.h rename to table/block_based/flush_block_policy.h diff --git a/table/full_filter_block.cc b/table/block_based/full_filter_block.cc similarity index 99% rename from table/full_filter_block.cc rename to table/block_based/full_filter_block.cc index 9015e96d2ea..56dc74c6710 100644 --- a/table/full_filter_block.cc +++ 
b/table/block_based/full_filter_block.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/full_filter_block.h" +#include "table/block_based/full_filter_block.h" #ifdef ROCKSDB_MALLOC_USABLE_SIZE #ifdef OS_FREEBSD diff --git a/table/full_filter_block.h b/table/block_based/full_filter_block.h similarity index 99% rename from table/full_filter_block.h rename to table/block_based/full_filter_block.h index f97952a7ced..3e5d82733b0 100644 --- a/table/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -10,12 +10,13 @@ #include #include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "db/dbformat.h" #include "util/hash.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block.h" namespace rocksdb { diff --git a/table/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc similarity index 99% rename from table/full_filter_block_test.cc rename to table/block_based/full_filter_block_test.cc index 0ef5c5a970c..8b99f54b03f 100644 --- a/table/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/full_filter_block.h" +#include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" diff --git a/table/index_builder.cc b/table/block_based/index_builder.cc similarity index 98% rename from table/index_builder.cc rename to table/block_based/index_builder.cc index 63cb80598fe..f11ecd4f4bc 100644 --- a/table/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -7,7 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/index_builder.h" +#include "table/block_based/index_builder.h" + #include #include @@ -17,7 +18,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/flush_block_policy.h" #include "table/format.h" -#include "table/partitioned_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace rocksdb { diff --git a/table/index_builder.h b/table/block_based/index_builder.h similarity index 99% rename from table/index_builder.h rename to table/block_based/index_builder.h index 2f349fc5471..7e6a4bb0776 100644 --- a/table/index_builder.h +++ b/table/block_based/index_builder.h @@ -17,8 +17,8 @@ #include #include "rocksdb/comparator.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" #include "table/format.h" namespace rocksdb { diff --git a/table/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc similarity index 98% rename from table/partitioned_filter_block.cc rename to table/block_based/partitioned_filter_block.cc index 3ccc7946393..315e63306f1 100644 --- a/table/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "table/partitioned_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #ifdef ROCKSDB_MALLOC_USABLE_SIZE #ifdef OS_FREEBSD @@ -17,8 +17,8 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h similarity index 95% rename from table/partitioned_filter_block.h rename to table/block_based/partitioned_filter_block.h index 2563dd2bf35..735f1c6e3eb 100644 --- a/table/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -9,15 +9,14 @@ #include #include #include "db/dbformat.h" +#include "index_builder.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" - -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/cachable_entry.h" -#include "table/full_filter_block.h" -#include "table/index_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/cachable_entry.h" #include "util/autovector.h" namespace rocksdb { diff --git a/table/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc similarity index 99% rename from table/partitioned_filter_block_test.cc rename to table/block_based/partitioned_filter_block_test.cc index 4bdc2fd36f1..2bcafa9771a 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -3,13 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). + #include #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" + +#include "index_builder.h" #include "util/coding.h" #include "util/hash.h" #include "util/logging.h" @@ -96,7 +98,7 @@ class PartitionedFilterBlockTest partition_size * table_options_.block_size_deviation / 100; } - int last_offset = 10; + uint64_t last_offset = 10; BlockHandle Write(const Slice& slice) { BlockHandle bh(last_offset + 1, slice.size()); slices[bh.offset()] = slice; diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 1f209210c13..6c663702900 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -15,8 +15,8 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 0dcdfc76125..56b74b50427 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include "table/block.h" +#include "table/block_based/block.h" #include "table/format.h" #include "util/memory_allocator.h" diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc similarity index 99% rename from table/cuckoo_table_builder.cc rename to table/cuckoo/cuckoo_table_builder.cc index f590e6ad405..f1a64cb6a67 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include #include @@ -15,8 +15,8 @@ #include "db/dbformat.h" #include "rocksdb/env.h" #include "rocksdb/table.h" -#include "table/block_builder.h" -#include "table/cuckoo_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" diff --git a/table/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h similarity index 100% rename from table/cuckoo_table_builder.h rename to table/cuckoo/cuckoo_table_builder.h diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc similarity index 99% rename from table/cuckoo_table_builder_test.cc rename to table/cuckoo/cuckoo_table_builder_test.cc index eeba9480592..1467e2a8d1b 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -11,7 +11,7 @@ #include #include "table/meta_blocks.h" -#include "table/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include "util/file_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc similarity index 94% rename from table/cuckoo_table_factory.cc rename to table/cuckoo/cuckoo_table_factory.cc index 74d18d51213..4ca29f364cf 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -4,11 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_reader.h" namespace rocksdb { diff --git a/table/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h similarity index 100% rename from table/cuckoo_table_factory.h rename to table/cuckoo/cuckoo_table_factory.h diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc similarity index 99% rename from table/cuckoo_table_reader.cc rename to table/cuckoo/cuckoo_table_reader.cc index f4df2467fdb..72885be940e 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include #include @@ -19,7 +19,7 @@ #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" -#include "table/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" diff --git a/table/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h similarity index 100% rename from table/cuckoo_table_reader.h rename to table/cuckoo/cuckoo_table_reader.h diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc similarity index 99% rename from table/cuckoo_table_reader_test.cc rename to table/cuckoo/cuckoo_table_reader_test.cc index 6d596f6e115..71e231336c5 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -22,9 +22,9 @@ int main() { #include #include -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/get_context.h" #include "table/meta_blocks.h" #include "util/arena.h" diff --git a/table/format.cc b/table/format.cc index 476db85f731..1adcce6f3f4 100644 --- a/table/format.cc +++ b/table/format.cc @@ -12,12 +12,12 @@ #include #include +#include "block_fetcher.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" diff --git a/table/get_context.h b/table/get_context.h index 856e01a9502..8df343b3653 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -11,7 +11,7 @@ #include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" -#include "table/block.h" +#include "table/block_based/block.h" namespace rocksdb { class MergeContext; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 98e05a4d032..9d56c5b9c29 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -7,11 +7,11 @@ #include #include +#include "block_fetcher.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 6efd1225e19..5224c54714d 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -15,7 +15,7 @@ #include "rocksdb/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "util/kv_map.h" diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index 4e90697a6e5..8431f13db37 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "table/persistent_cache_helper.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" namespace rocksdb { diff --git a/table/plain_table_builder.cc b/table/plain/plain_table_builder.cc similarity index 98% rename from table/plain_table_builder.cc rename to table/plain/plain_table_builder.cc index 453b6c768b5..6160d7afd9e 100644 --- a/table/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/plain_table_builder.h" +#include "table/plain/plain_table_builder.h" #include @@ -17,11 +17,11 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "db/dbformat.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/bloom_block.h" -#include "table/plain_table_index.h" +#include "table/plain/plain_table_index.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/coding.h" diff --git a/table/plain_table_builder.h b/table/plain/plain_table_builder.h similarity index 98% rename from table/plain_table_builder.h rename to table/plain/plain_table_builder.h index 9a5b44b9c2c..0a29098d657 100644 --- a/table/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include #include @@ -13,8 +14,8 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/bloom_block.h" -#include "table/plain_table_index.h" -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_index.h" +#include "table/plain/plain_table_key_coding.h" #include "table/table_builder.h" namespace rocksdb { diff --git a/table/plain_table_factory.cc b/table/plain/plain_table_factory.cc similarity index 98% rename from table/plain_table_factory.cc rename to table/plain/plain_table_factory.cc index 0dccec55242..6c6905dab1f 100644 --- a/table/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include #include @@ -12,8 +12,8 @@ #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/convenience.h" -#include "table/plain_table_builder.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" #include "util/string_util.h" namespace rocksdb { diff --git a/table/plain_table_factory.h b/table/plain/plain_table_factory.h similarity index 100% rename from table/plain_table_factory.h rename to table/plain/plain_table_factory.h diff --git a/table/plain_table_index.cc b/table/plain/plain_table_index.cc similarity index 99% rename from table/plain_table_index.cc rename to table/plain/plain_table_index.cc index 43740923974..196be22cfe9 100644 --- a/table/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -11,7 +11,7 @@ #include -#include "table/plain_table_index.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/plain_table_index.h b/table/plain/plain_table_index.h similarity index 100% rename from table/plain_table_index.h rename to table/plain/plain_table_index.h diff --git a/table/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc similarity index 99% rename from table/plain_table_key_coding.cc rename to table/plain/plain_table_key_coding.cc index 6f5ee9b4ad2..9c4b614b549 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -4,13 +4,13 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_key_coding.h" #include #include #include "db/dbformat.h" -#include "table/plain_table_reader.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" +#include "table/plain/plain_table_factory.h" #include "util/file_reader_writer.h" namespace rocksdb { diff --git a/table/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h similarity index 99% rename from table/plain_table_key_coding.h rename to table/plain/plain_table_key_coding.h index 93f8f7af4b5..26af3f6d8bd 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -4,12 +4,13 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include #include "rocksdb/slice.h" #include "db/dbformat.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_reader.h" // The file contains three helper classes of PlainTable format, // PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. 
diff --git a/table/plain_table_reader.cc b/table/plain/plain_table_reader.cc similarity index 99% rename from table/plain_table_reader.cc rename to table/plain/plain_table_reader.cc index f33afdefc38..b4aad55876b 100644 --- a/table/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_reader.h" #include #include @@ -19,15 +19,15 @@ #include "rocksdb/options.h" #include "rocksdb/statistics.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/bloom_block.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" #include "table/get_context.h" #include "monitoring/histogram.h" diff --git a/table/plain_table_reader.h b/table/plain/plain_table_reader.h similarity index 98% rename from table/plain_table_reader.h rename to table/plain/plain_table_reader.h index 12b22aaf12e..ec6e6a7febb 100644 --- a/table/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -19,8 +19,8 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/table_reader.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_index.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "util/arena.h" #include "util/dynamic_bloom.h" #include "util/file_reader_writer.h" diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 71b395fd6be..69993492d48 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -6,9 +6,10 @@ #include "rocksdb/sst_file_writer.h" #include + #include "db/dbformat.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" #include "util/file_reader_writer.h" #include "test_util/sync_point.h" diff --git a/table/table_properties.cc b/table/table_properties.cc index 8cfa2619591..6e481798c35 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -4,10 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "rocksdb/table_properties.h" + #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/internal_iterator.h" #include "table/table_properties_internal.h" #include "util/string_util.h" diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 6b05d385e06..f2ae016c10d 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -17,10 +17,10 @@ int main() { #include "rocksdb/db.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" diff --git a/table/table_test.cc b/table/table_test.cc index aeb66f8d35f..372443b536a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -16,11 +16,13 @@ #include #include +#include "block_fetcher.h" #include "cache/lru_cache.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" +#include "meta_blocks.h" #include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -32,18 +34,16 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/block_fetcher.h" -#include "table/flush_block_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "util/compression.h" diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a8f617dee29..ba883763e9f 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -11,7 +11,7 @@ #include "db/pinned_iterators_manager.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/format.h" #include "util/arena.h" diff --git a/test_util/testutil.h b/test_util/testutil.h index 2aab3df72c4..7890ce5f511 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -20,9 +20,9 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" #include "util/random.h" diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index a2c226b926c..ea27f3c8d45 100644 --- a/tools/sst_dump_test.cc +++ 
b/tools/sst_dump_test.cc @@ -13,7 +13,7 @@ #include "rocksdb/sst_dump_tool.h" #include "rocksdb/filter_policy.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" #include "test_util/testharness.h" diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 5cbbfc38542..aa051da01f5 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License @@ -28,13 +29,13 @@ #include "rocksdb/status.h" #include "rocksdb/table_properties.h" #include "rocksdb/utilities/ldb_cmd.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "util/compression.h" #include "util/random.h" diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index a0186925243..03057afbc78 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -41,7 +41,7 @@ #include "rocksdb/utilities/ldb_cmd.h" #include "rocksdb/write_batch.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "tools/trace_analyzer_tool.h" #include "util/coding.h" diff --git a/util/bloom.cc b/util/bloom.cc index 1da4f2aa428..bedf4a65839 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -10,9 +10,9 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" -#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include "table/full_filter_bits_builder.h" -#include "table/full_filter_block.h" +#include "table/block_based/full_filter_block.h" #include "util/coding.h" #include "util/hash.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 54eb3f2dbb5..a93169c30cd 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
 // This source code is licensed under both the GPLv2 (found in the
 // COPYING file in the root directory) and Apache 2.0 License
@@ -22,9 +23,9 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/stackable_db.h"
 #include "rocksdb/utilities/transaction.h"
-#include "table/block.h"
-#include "table/block_based_table_builder.h"
-#include "table/block_builder.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_builder.h"
 #include "table/meta_blocks.h"
 #include "util/cast_util.h"
 #include "util/crc32c.h"
diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc
index c3ff640816e..224f7886bf1 100644
--- a/utilities/memory/memory_test.cc
+++ b/utilities/memory/memory_test.cc
@@ -10,7 +10,7 @@
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/memory_util.h"
 #include "rocksdb/utilities/stackable_db.h"
-#include "table/block_based_table_factory.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "util/string_util.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
diff --git a/utilities/persistent_cache/persistent_cache_bench.cc b/utilities/persistent_cache/persistent_cache_bench.cc
index 64d75c7a518..8bc795455cd 100644
--- a/utilities/persistent_cache/persistent_cache_bench.cc
+++ b/utilities/persistent_cache/persistent_cache_bench.cc
@@ -23,7 +23,7 @@ int main() { fprintf(stderr, "Please install gflags to run tools\n"); }
 #include "monitoring/histogram.h"
 #include "port/port.h"
-#include "table/block_builder.h"
+#include "table/block_based/block_builder.h"
 #include "util/gflags_compat.h"
 #include "util/mutexlock.h"
 #include "util/stop_watch.h"
diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h
index 33cda4ea72d..6d15d13b69b 100644
--- a/utilities/persistent_cache/persistent_cache_test.h
+++ b/utilities/persistent_cache/persistent_cache_test.h
@@ -20,7 +20,7 @@
 #include "db/db_test_util.h"
 #include "rocksdb/cache.h"
-#include "table/block_builder.h"
+#include "table/block_based/block_builder.h"
 #include "port/port.h"
 #include "util/arena.h"
 #include "test_util/testharness.h"

From 1b59a490ef8d8da78c826b379167207dfa682b4c Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Thu, 30 May 2019 16:07:57 -0700
Subject: [PATCH 086/572] Fix flaky DBTest2.PresetCompressionDict test (#5378)

Summary:
Fix flaky DBTest2.PresetCompressionDict test. This PR fixes two issues with the test:

1. Replaces `GetSstFiles` with `TotalSize`, which is based on `DB::GetColumnFamilyMetaData`, so that only the size of the live SST files is taken into consideration when computing the total size of all SST files. Earlier, with `GetSstFiles`, even obsolete files were getting picked up.
2. In ZSTD compression, it is sometimes possible that using a trained dictionary is not better than using an untrained one. Using a trained dictionary performs well in 99% of the cases, but in the remaining ~1% of the cases (out of 10000 runs) using an untrained dictionary gets better compression results.
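For reference, a minimal sketch of the public API the two fixes rely on, assuming the `CompressionOptions` fields and `DB::GetColumnFamilyMetaData` interface current at the time of this patch; the helper names and byte sizes below are illustrative placeholders, not code from this PR:

// Minimal sketch (assumed usage, not part of this patch).
#include <cstdint>

#include "rocksdb/db.h"
#include "rocksdb/metadata.h"
#include "rocksdb/options.h"

// The three dictionary modes the test cycles through. Sizes are placeholders.
void ConfigureDictionary(rocksdb::Options* options, bool use_dict,
                         bool train_with_zstd) {
  // max_dict_bytes > 0 enables preset-dictionary compression.
  options->compression_opts.max_dict_bytes = use_dict ? 4 * 1024 : 0;
  // zstd_max_train_bytes > 0 additionally trains the dictionary; it only
  // takes effect when options->compression == rocksdb::kZSTD.
  options->compression_opts.zstd_max_train_bytes =
      (use_dict && train_with_zstd) ? 128 * 1024 : 0;
}

// Total size of the *live* SST files of a column family. Unlike listing the
// DB directory, GetColumnFamilyMetaData does not count obsolete files.
uint64_t LiveSstBytes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(cf, &meta);
  return meta.size;
}

A trained dictionary (`zstd_max_train_bytes > 0`) usually compresses better than an untrained one, but as the summary notes it is not guaranteed to, which is why the test below only asserts that some dictionary beats no dictionary.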
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5378
Differential Revision: D15559100
Pulled By: sagar0
fbshipit-source-id: c35adbf13871f520a2cec48f8bad9ff27ff7a0b4
---
 db/db_test2.cc | 58 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/db/db_test2.cc b/db/db_test2.cc
index d93beb4477f..109a7a377bf 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -1036,8 +1036,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
   ASSERT_TRUE(index == keys_cf.size());
 }
 
-// Temporarily disable it because the test is flaky.
-TEST_F(DBTest2, DISABLED_PresetCompressionDict) {
+TEST_F(DBTest2, PresetCompressionDict) {
   // Verifies that compression ratio improves when dictionary is enabled, and
   // improves even further when the dictionary is trained by ZSTD.
   const size_t kBlockSizeBytes = 4 << 10;
@@ -1046,7 +1045,8 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) {
   const int kNumL0Files = 5;
 
   Options options;
-  options.env = CurrentOptions().env; // Make sure to use any custom env that the test is configured with.
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
   options.allow_concurrent_memtable_write = false;
   options.arena_block_size = kBlockSizeBytes;
   options.create_if_missing = true;
@@ -1072,10 +1072,19 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) {
     compression_types.push_back(kZSTD);
   }
 
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
   for (auto compression_type : compression_types) {
     options.compression = compression_type;
-    size_t prev_out_bytes;
-    for (int i = 0; i < 3; ++i) {
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
       // First iteration: compress without preset dictionary
       // Second iteration: compress with preset dictionary
       // Third iteration (zstd only): compress with zstd-trained dictionary
@@ -1085,19 +1094,19 @@
       // the non-first iterations, verify the data we get out is the same data
       // we put in.
       switch (i) {
-        case 0:
+        case kWithoutDict:
           options.compression_opts.max_dict_bytes = 0;
           options.compression_opts.zstd_max_train_bytes = 0;
           break;
-        case 1:
-          options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
           options.compression_opts.zstd_max_train_bytes = 0;
           break;
-        case 2:
+        case kWithZSTDTrainedDict:
           if (compression_type != kZSTD) {
             continue;
           }
-          options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes;
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
           options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
           break;
         default:
@@ -1129,23 +1138,32 @@
       ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
       ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
 
-      size_t out_bytes = 0;
-      std::vector files;
-      GetSstFiles(env_, dbname_, &files);
-      for (const auto& file : files) {
-        uint64_t curr_bytes;
-        env_->GetFileSize(dbname_ + "/" + file, &curr_bytes);
-        out_bytes += static_cast(curr_bytes);
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
       }
 
       for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); j++) {
         ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast(j))));
       }
-      if (i) {
-        ASSERT_GT(prev_out_bytes, out_bytes);
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
       }
-      prev_out_bytes = out_bytes;
+      DestroyAndReopen(options);
     }
   }

From f1302ebab8c39ba441a33e73b8e37d75d53efa22 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Thu, 30 May 2019 16:09:45 -0700
Subject: [PATCH 087/572] Add class-level comments to version-related classes
 (#5348)

Summary:
As title.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5348
Differential Revision: D15564595
Pulled By: riversand963
fbshipit-source-id: dd45aa86a70e0343c2e9ef702fad165163f548e6
---
 db/version_set.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/db/version_set.h b/db/version_set.h
index 28ad0c2c234..776e08e448c 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -91,6 +91,9 @@ extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
                                       const std::vector& files,
                                       Arena* arena);
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, etc.
 class VersionStorageInfo {
  public:
   VersionStorageInfo(const InternalKeyComparator* internal_comparator,
@@ -537,6 +540,8 @@ class VersionStorageInfo {
 };
 
 using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the SST files owned by the column
+// family at a certain point in time.
 class Version {
  public:
   // Append to *iters a sequence of iterators that will
@@ -747,6 +752,9 @@ struct ObsoleteFileInfo {
 
 class BaseReferencedVersionBuilder;
 
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. set of the column families.
 class VersionSet {
  public:
   VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
@@ -1103,6 +1111,10 @@ class VersionSet {
                                      VersionEdit* edit, InstrumentedMutex* mu);
 };
 
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
 class ReactiveVersionSet : public VersionSet {
  public:
   ReactiveVersionSet(const std::string& dbname,

From 8843129ecef255a70f186e095063b4e79b2b0c73 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Thu, 30 May 2019 17:39:43 -0700
Subject: [PATCH 088/572] Move some memory related files from util/ to memory/
 (#5382)

Summary:
Move arena, allocator, and memory tools under util to a separate memory/ directory.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5382
Differential Revision: D15564655
Pulled By: siying
fbshipit-source-id: 9cd6b5d0d3d52b39606e19221fa154596e5852a5
---
 CMakeLists.txt | 8 +-
 Makefile | 2 +-
 TARGETS | 10 +-
 cache/cache_test.cc | 2 +-
 db/builder.cc | 2 +-
 db/column_family_test.cc | 4 +-
 db/compact_files_test.cc | 2 +-
 db/compaction.cc | 2 +-
 db/compaction.h | 2 +-
 db/compaction_iterator_test.cc | 2 +-
 db/compaction_job.cc | 2 +-
 db/compaction_job_stats_test.cc | 6 +-
 db/compaction_job_test.cc | 4 +-
 db/compaction_picker.cc | 2 +-
 db/compaction_picker_test.cc | 4 +-
 db/compaction_picker_universal.cc | 2 +-
 db/comparator_db_test.cc | 4 +-
 db/corruption_test.cc | 3 +-
 db/cuckoo_table_db_test.cc | 2 +-
 db/db_compaction_test.cc | 2 +-
 db/db_filesnapshot.cc | 2 +-
 db/db_impl.cc | 2 +-
 db/db_impl_compaction_flush.cc | 2 +-
 db/db_impl_open.cc | 2 +-
 db/db_iter.cc | 2 +-
 db/db_iter.h | 2 +-
 db/db_iter_stress_test.cc | 2 +-
 db/db_iter_test.cc | 2 +-
 db/db_options_test.cc | 2 +-
 db/db_test.cc | 6 +-
 db/db_test_util.h | 4 +-
 db/db_write_test.cc | 2 +-
 db/dbformat_test.cc | 2 +-
 db/deletefile_test.cc | 2 +-
 db/external_sst_file_ingestion_job.cc | 2 +-
 db/fault_injection_test.cc | 4 +-
 db/file_indexer.h | 2 +-
 db/file_indexer_test.cc | 2 +-
 db/filename_test.cc | 2 +-
 db/flush_job.cc | 2 +-
 db/flush_job_test.cc | 4 +-
 db/forward_iterator.cc | 2 +-
 db/forward_iterator.h | 4 +-
 db/forward_iterator_bench.cc | 2 +-
 db/listener_test.cc | 7 +-
 db/log_test.cc | 4 +-
 db/manual_compaction_test.cc | 4 +-
 db/memtable.cc | 4 +-
 db/memtable.h | 4 +-
 db/memtable_list.cc | 2 +-
 db/memtable_list_test.cc | 2 +-
 db/merge_helper_test.cc | 2 +-
 db/merge_test.cc | 8 +-
 db/obsolete_files_test.cc | 2 +-
 db/perf_context_test.cc | 2 +-
 db/plain_table_db_test.cc | 4 +-
 db/prefix_test.cc | 2 +-
 db/range_del_aggregator_bench.cc | 2 +-
 db/table_cache.cc | 2 +-
 db/table_properties_collector_test.cc | 6 +-
 db/version_builder_test.cc | 4 +-
 db/version_edit.cc | 2 +-
 db/version_edit.h | 6 +-
 db/version_edit_test.cc | 2 +-
 db/version_set.cc | 2 +-
 db/version_set_test.cc | 4 +-
 db/wal_manager.cc | 2 +-
 db/wal_manager_test.cc | 4 +-
 db/write_batch_test.cc | 2 +-
 db/write_callback_test.cc | 4 +-
 db/write_thread.cc | 2 +-
 env/env.cc | 2 +-
 env/env_posix.cc | 2 +-
env/env_test.cc | 6 +- env/io_posix.cc | 2 +- file/delete_scheduler.cc | 2 +- file/delete_scheduler_test.cc | 2 +- file/filename.cc | 2 +- file/sst_file_manager_impl.cc | 2 +- java/rocksjni/write_batch_test.cc | 2 +- {util => memory}/allocator.h | 0 {util => memory}/arena.cc | 4 +- {util => memory}/arena.h | 8 +- {util => memory}/arena_test.cc | 4 +- {util => memory}/concurrent_arena.cc | 2 +- {util => memory}/concurrent_arena.h | 4 +- {util => memory}/jemalloc_nodump_allocator.cc | 2 +- {util => memory}/jemalloc_nodump_allocator.h | 0 {util => memory}/memory_allocator.h | 0 {util => memory}/memory_usage.h | 0 memtable/alloc_tracker.cc | 4 +- memtable/hash_linklist_rep.cc | 2 +- memtable/hash_skiplist_rep.cc | 8 +- memtable/inlineskiplist.h | 2 +- memtable/inlineskiplist_test.cc | 4 +- memtable/memtablerep_bench.cc | 4 +- memtable/skiplist.h | 4 +- memtable/skiplist_test.cc | 4 +- memtable/skiplistrep.cc | 4 +- memtable/vectorrep.cc | 2 +- options/options_helper.cc | 2 +- options/options_parser.cc | 2 +- options/options_test.cc | 4 +- port/win/env_default.cc | 2 +- port/win/io_win.cc | 2 +- src.mk | 8 +- table/block_based/block.h | 4 +- .../block_based_filter_block_test.cc | 4 +- .../block_based/block_based_table_builder.cc | 5 +- table/block_based/block_based_table_builder.h | 2 +- .../block_based/block_based_table_factory.cc | 2 +- table/block_based/block_based_table_reader.cc | 178 ++++++++---------- table/block_based/block_based_table_reader.h | 24 +-- table/block_based/block_prefix_index.cc | 2 +- table/block_based/block_test.cc | 2 +- .../block_based/data_block_hash_index_test.cc | 1 - table/block_based/filter_block.h | 2 +- table/block_based/full_filter_block.h | 4 +- table/block_based/full_filter_block_test.cc | 4 +- table/block_based/index_builder.cc | 2 +- table/block_based/partitioned_filter_block.h | 2 +- .../partitioned_filter_block_test.cc | 7 +- table/block_fetcher.cc | 2 +- table/block_fetcher.h | 2 +- table/cuckoo/cuckoo_table_builder_test.cc | 4 +- table/cuckoo/cuckoo_table_reader.cc | 6 +- table/cuckoo/cuckoo_table_reader_test.cc | 6 +- table/format.cc | 2 +- table/format.h | 2 +- table/get_context.h | 2 +- table/iterator.cc | 2 +- table/merging_iterator.cc | 4 +- table/meta_blocks.cc | 2 +- table/mock_table.h | 4 +- table/plain/plain_table_builder.cc | 6 +- table/plain/plain_table_index.h | 2 +- table/plain/plain_table_key_coding.cc | 2 +- table/plain/plain_table_key_coding.h | 2 +- table/plain/plain_table_reader.cc | 8 +- table/plain/plain_table_reader.h | 4 +- table/sst_file_writer.cc | 2 +- table/table_reader_bench.cc | 4 +- table/table_test.cc | 6 +- table/two_level_iterator.cc | 2 +- tools/db_bench_tool.cc | 4 +- tools/db_bench_tool_test.cc | 2 +- tools/db_repl_stress.cc | 2 +- tools/db_stress.cc | 9 +- tools/reduce_levels_test.cc | 4 +- tools/sst_dump_test.cc | 2 +- tools/trace_analyzer_test.cc | 2 +- util/auto_roll_logger.h | 2 +- util/auto_roll_logger_test.cc | 2 +- util/autovector_test.cc | 4 +- util/bloom.cc | 2 +- util/bloom_test.cc | 6 +- util/compression.h | 2 +- util/dynamic_bloom.cc | 2 +- util/dynamic_bloom_test.cc | 6 +- util/event_logger_test.cc | 2 +- util/file_reader_writer.cc | 2 +- util/file_reader_writer.h | 2 +- util/file_reader_writer_test.cc | 2 +- util/filelock_test.cc | 4 +- util/hash_test.cc | 2 +- util/log_buffer.h | 6 +- util/log_write_bench.cc | 4 +- util/rate_limiter.cc | 2 +- util/rate_limiter_test.cc | 2 +- util/repeatable_thread_test.cc | 2 +- util/thread_local_test.cc | 4 +- utilities/backupable/backupable_db.cc | 2 +- 
utilities/backupable/backupable_db_test.cc | 6 +- utilities/blob_db/blob_db_impl.cc | 2 +- utilities/blob_db/blob_db_test.cc | 6 +- .../cassandra/cassandra_functional_test.cc | 4 +- utilities/cassandra/format.h | 2 +- utilities/memory/memory_test.cc | 2 +- .../string_append/stringappend_test.cc | 4 +- utilities/options/options_util_test.cc | 2 +- .../persistent_cache/block_cache_tier.cc | 2 +- utilities/persistent_cache/block_cache_tier.h | 2 +- .../block_cache_tier_file_buffer.h | 2 +- utilities/persistent_cache/hash_table_test.cc | 4 +- .../persistent_cache/persistent_cache_test.h | 4 +- .../optimistic_transaction_test.cc | 6 +- .../transactions/pessimistic_transaction.cc | 2 +- .../pessimistic_transaction_db.cc | 2 +- .../transactions/transaction_lock_mgr.cc | 2 +- utilities/transactions/transaction_test.cc | 4 +- utilities/transactions/transaction_test.h | 4 +- .../write_prepared_transaction_test.cc | 6 +- .../transactions/write_prepared_txn_db.cc | 2 +- utilities/ttl/ttl_test.cc | 2 +- .../write_batch_with_index.cc | 2 +- .../write_batch_with_index_test.cc | 6 +- 196 files changed, 397 insertions(+), 413 deletions(-) rename {util => memory}/allocator.h (100%) rename {util => memory}/arena.cc (99%) rename {util => memory}/arena.h (99%) rename {util => memory}/arena_test.cc (99%) rename {util => memory}/concurrent_arena.cc (97%) rename {util => memory}/concurrent_arena.h (99%) rename {util => memory}/jemalloc_nodump_allocator.cc (99%) rename {util => memory}/jemalloc_nodump_allocator.h (100%) rename {util => memory}/memory_allocator.h (100%) rename {util => memory}/memory_usage.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5614c83b44a..c4dc2500fb5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,6 +542,9 @@ set(SOURCES file/file_util.cc file/filename.cc file/sst_file_manager_impl.cc + memory/arena.cc + memory/concurrent_arena.cc + memory/jemalloc_nodump_allocator.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -610,14 +613,12 @@ set(SOURCES tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - util/arena.cc util/auto_roll_logger.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc util/compression_context_cache.cc - util/concurrent_arena.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc util/dynamic_bloom.cc @@ -625,7 +626,6 @@ set(SOURCES util/file_reader_writer.cc util/filter_policy.cc util/hash.cc - util/jemalloc_nodump_allocator.cc util/log_buffer.cc util/murmurhash.cc util/random.cc @@ -932,6 +932,7 @@ if(WITH_TESTS) env/env_test.cc env/mock_env_test.cc file/delete_scheduler_test.cc + memory/arena_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc @@ -955,7 +956,6 @@ if(WITH_TESTS) tools/reduce_levels_test.cc tools/sst_dump_test.cc tools/trace_analyzer_test.cc - util/arena_test.cc util/auto_roll_logger_test.cc util/autovector_test.cc util/bloom_test.cc diff --git a/Makefile b/Makefile index d41192ab2e0..244b929c418 100644 --- a/Makefile +++ b/Makefile @@ -1127,7 +1127,7 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) +arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index 70d6e219413..a59af2fa697 100644 --- a/TARGETS +++ b/TARGETS @@ 
-147,6 +147,9 @@ cpp_library( "file/file_util.cc", "file/filename.cc", "file/sst_file_manager_impl.cc", + "memory/arena.cc", + "memory/concurrent_arena.cc", + "memory/jemalloc_nodump_allocator.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -181,8 +184,8 @@ cpp_library( "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", "table/block_based/block_prefix_index.cc", - "table/block_based/data_block_hash_index.cc", "table/block_based/data_block_footer.cc", + "table/block_based/data_block_hash_index.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", @@ -214,7 +217,6 @@ cpp_library( "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", - "util/arena.cc", "util/auto_roll_logger.cc", "util/bloom.cc", "util/build_version.cc", @@ -222,7 +224,6 @@ cpp_library( "util/compaction_job_stats_impl.cc", "util/comparator.cc", "util/compression_context_cache.cc", - "util/concurrent_arena.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", "util/dynamic_bloom.cc", @@ -230,7 +231,6 @@ cpp_library( "util/file_reader_writer.cc", "util/filter_policy.cc", "util/hash.cc", - "util/jemalloc_nodump_allocator.cc", "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", @@ -353,7 +353,7 @@ cpp_library( ROCKS_TESTS = [ [ "arena_test", - "util/arena_test.cc", + "memory/arena_test.cc", "serial", ], [ diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 377ae146876..0cc3d559502 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -16,9 +16,9 @@ #include #include "cache/clock_cache.h" #include "cache/lru_cache.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/string_util.h" -#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/builder.cc b/db/builder.cc index 14160f64c75..86aac02ab74 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -32,9 +32,9 @@ #include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" +#include "test_util/sync_point.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/column_family_test.cc b/db/column_family_test.cc index f5d57c35b78..21b3321bea6 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -20,12 +20,12 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "util/coding.h" #include "test_util/fault_injection_test_env.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index b97fd064e70..438fdb7c96f 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -14,9 +14,9 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction.cc b/db/compaction.cc index 00ebd28b087..089dd66848e 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -18,8 +18,8 @@ #include "db/column_family.h" #include "rocksdb/compaction_filter.h" -#include "util/string_util.h" #include "test_util/sync_point.h" 
+#include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction.h b/db/compaction.h index e9ded632503..598b08e7c65 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -9,8 +9,8 @@ #pragma once #include "db/version_set.h" +#include "memory/arena.h" #include "options/cf_options.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index b0a553136a3..99bb026b5a9 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -9,9 +9,9 @@ #include #include "port/port.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 9e22e161f28..92a6fab8da8 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -53,6 +53,7 @@ #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/log_buffer.h" @@ -61,7 +62,6 @@ #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index daf41386690..35c1100f99b 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -52,15 +52,15 @@ #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 4608cceeac1..93e55b7a03b 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -25,10 +25,10 @@ #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index c01f2884d4c..b25f6cb0890 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -22,10 +22,10 @@ #include "db/column_family.h" #include "file/filename.h" #include "monitoring/statistics.h" +#include "test_util/sync_point.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 82fc16f4f5a..dd33009eb12 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -12,10 +12,10 @@ #include "db/compaction_picker_fifo.h" #include "db/compaction_picker_universal.h" -#include "util/logging.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include 
"test_util/testutil.h" +#include "util/logging.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index b8d23795fbc..20edd30748d 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -22,10 +22,10 @@ #include "db/column_family.h" #include "file/filename.h" #include "monitoring/statistics.h" +#include "test_util/sync_point.h" #include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index ba7042049cb..de55c706ab7 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -9,11 +9,11 @@ #include "memtable/stl_wrappers.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" #include "util/kv_map.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 130821ff997..53c4d42d28a 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -27,10 +27,9 @@ #include "rocksdb/write_batch.h" #include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" -#include "file/filename.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index f9efbc58503..6f60e2d7037 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -11,9 +11,9 @@ #include "table/cuckoo/cuckoo_table_factory.h" #include "table/cuckoo/cuckoo_table_reader.h" #include "table/meta_blocks.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 623836454db..3051e89cd37 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -13,9 +13,9 @@ #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" #include "rocksdb/utilities/convenience.h" -#include "util/concurrent_task_limiter_impl.h" #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" +#include "util/concurrent_task_limiter_impl.h" namespace rocksdb { diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a1a1c8f99d6..59757aeb9f7 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -22,8 +22,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git a/db/db_impl.cc b/db/db_impl.cc index ec162bb961e..5534c225f4d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -84,6 +84,7 @@ #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "tools/sst_dump_tool_imp.h" #include "util/auto_roll_logger.h" #include "util/autovector.h" @@ -97,7 +98,6 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); diff --git 
a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index c5cc0736665..c6025a8cc57 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -21,8 +21,8 @@ #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" -#include "util/concurrent_task_limiter_impl.h" #include "test_util/sync_point.h" +#include "util/concurrent_task_limiter_impl.h" namespace rocksdb { diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index db47d141655..5dae140c7ea 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -19,8 +19,8 @@ #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based/block_based_table_factory.h" -#include "util/rate_limiter.h" #include "test_util/sync_point.h" +#include "util/rate_limiter.h" namespace rocksdb { Options SanitizeOptions(const std::string& dbname, const Options& src) { diff --git a/db/db_iter.cc b/db/db_iter.cc index 8fc17e1446e..d953d365e0f 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -17,6 +17,7 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "file/filename.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -24,7 +25,6 @@ #include "rocksdb/options.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/db/db_iter.h b/db/db_iter.h index 8d8af3fd292..85b546c544c 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -13,10 +13,10 @@ #include "db/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" +#include "memory/arena.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index 8c3588e9abd..b864ac4eae1 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -8,9 +8,9 @@ #include "rocksdb/comparator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/testharness.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" #ifdef GFLAGS diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 49e670abc28..1503886443b 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 37a9f1a365b..b899ba18b4a 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -19,9 +19,9 @@ #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" -#include "util/random.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" +#include "util/random.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index bf0babd1a3a..debb2ba603e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -58,14 +58,14 @@ #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include 
"test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" #include "util/file_reader_writer.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_test_util.h b/db/db_test_util.h index 1882cde59dc..2af202fad96 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -46,14 +46,14 @@ #include "table/mock_table.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" -#include "util/compression.h" #include "test_util/mock_time_env.h" +#include "util/compression.h" #include "util/mutexlock.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 322381b3867..9eca823c2b7 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -13,8 +13,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "test_util/fault_injection_test_env.h" -#include "util/string_util.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index e3f06fe6b65..f4665b06ca3 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -8,8 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#include "util/logging.h" #include "test_util/testharness.h" +#include "util/logging.h" namespace rocksdb { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 9c67102c5f0..280d269f1c6 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -20,10 +20,10 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 26cd1127b94..aec398552c7 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -22,9 +22,9 @@ #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" +#include "test_util/sync_point.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 330df7bfe48..00619d447d1 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -22,11 +22,11 @@ #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "test_util/fault_injection_test_env.h" -#include "util/logging.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/logging.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git a/db/file_indexer.h b/db/file_indexer.h index 1bef3aab0ca..2091f80292b 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -12,8 +12,8 @@ #include #include #include +#include "memory/arena.h" #include "port/port.h" -#include "util/arena.h" #include 
"util/autovector.h" namespace rocksdb { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 754cb3c4651..6942aa682d6 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include #include "db/file_indexer.h" +#include #include "db/dbformat.h" #include "db/version_edit.h" #include "port/stack_trace.h" diff --git a/db/filename_test.cc b/db/filename_test.cc index dabe673d849..377d128fae0 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -11,8 +11,8 @@ #include "db/dbformat.h" #include "port/port.h" -#include "util/logging.h" #include "test_util/testharness.h" +#include "util/logging.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index c8729c66840..d4ae79ff29a 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -45,13 +45,13 @@ #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/event_logger.h" #include "util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index d97ad9f0c2d..ef89199c98e 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -13,10 +13,10 @@ #include "rocksdb/cache.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index f95debec62c..9e0823366d0 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -21,8 +21,8 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/merging_iterator.h" -#include "util/string_util.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 146588d961c..fb73f458edd 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -10,12 +10,12 @@ #include #include +#include "db/dbformat.h" +#include "memory/arena.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "db/dbformat.h" #include "table/internal_iterator.h" -#include "util/arena.h" namespace rocksdb { diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 9d6851dab16..17b0ca16544 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -34,8 +34,8 @@ int main() { return 0; } #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "util/gflags_compat.h" #include "test_util/testharness.h" +#include "util/gflags_compat.h" const int MAX_SHARDS = 100000; diff --git a/db/listener_test.cc b/db/listener_test.cc index 881534a1f1d..81a0fa17678 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -24,15 +24,14 @@ #include "rocksdb/table_properties.h" #include "table/block_based/block_based_table_factory.h" #include "table/plain/plain_table_factory.h" -#include "file/filename.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" 
#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" #ifndef ROCKSDB_LITE diff --git a/db/log_test.cc b/db/log_test.cc index 5b159acf21f..be7a3cbe7cf 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -10,12 +10,12 @@ #include "db/log_reader.h" #include "db/log_writer.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/random.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { namespace log { diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 35e5019ca7e..1a69a89dea0 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -8,12 +8,12 @@ #include #include -#include "rocksdb/db.h" +#include "port/port.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "test_util/testharness.h" -#include "port/port.h" using namespace rocksdb; diff --git a/db/memtable.cc b/db/memtable.cc index 0c706115de0..46acbbfa61a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -19,6 +19,8 @@ #include "db/pinned_iterators_manager.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "memory/arena.h" +#include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/port.h" @@ -31,10 +33,8 @@ #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" -#include "util/arena.h" #include "util/autovector.h" #include "util/coding.h" -#include "util/memory_usage.h" #include "util/mutexlock.h" #include "util/util.h" diff --git a/db/memtable.h b/db/memtable.h index 709e2061e5b..6b8c4141f5a 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,13 +19,13 @@ #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" +#include "memory/allocator.h" +#include "memory/concurrent_arena.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" -#include "util/allocator.h" -#include "util/concurrent_arena.h" #include "util/dynamic_bloom.h" #include "util/hash.h" diff --git a/db/memtable_list.cc b/db/memtable_list.cc index b50b58a1af7..2b4ac6b84da 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -22,9 +22,9 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/log_buffer.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 59da8af1664..f55fbdc501a 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -13,9 +13,9 @@ #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index dc3624af53e..3386f9bd067 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -9,9 
+9,9 @@ #include "db/merge_helper.h" #include "rocksdb/comparator.h" -#include "util/coding.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/merge_test.cc b/db/merge_test.cc index d3dadaa5d30..13c35d2c017 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,6 +7,9 @@ #include #include +#include "db/db_impl.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -14,11 +17,8 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/dbformat.h" -#include "db/db_impl.h" -#include "db/write_batch_internal.h" -#include "utilities/merge_operators.h" #include "test_util/testharness.h" +#include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index c6e7d6af07a..655c659b44f 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -20,10 +20,10 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" using std::cerr; using std::cout; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 42d592862c7..94eabff7ff5 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" bool FLAGS_random_key = false; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index bfeb54243d9..a73dd3cb431 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -29,12 +29,12 @@ #include "table/plain/plain_table_key_coding.h" #include "table/plain/plain_table_reader.h" #include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index e8290e76bca..3f2e794a6c4 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -26,12 +26,12 @@ int main() { #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 54a86169b20..97ba6ca4f8a 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -23,10 +23,10 @@ int main() { #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/random.h" #include "util/stop_watch.h" -#include "test_util/testutil.h" 
#include "util/gflags_compat.h" diff --git a/db/table_cache.cc b/db/table_cache.cc index 4efd3fdf759..14c0169c11a 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -22,10 +22,10 @@ #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/table_reader.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 0705cc032fe..e818f46142c 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -13,16 +13,16 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" -#include "table/meta_blocks.h" #include "options/cf_options.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" +#include "table/meta_blocks.h" #include "table/plain/plain_table_factory.h" #include "table/table_builder.h" -#include "util/coding.h" -#include "util/file_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 5c3bd686b1c..63067857420 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -6,10 +6,10 @@ #include #include "db/version_edit.h" #include "db/version_set.h" -#include "util/logging.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/logging.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/version_edit.cc b/db/version_edit.cc index 018517a1381..668ff60f103 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -11,10 +11,10 @@ #include "db/version_set.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/event_logger.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/version_edit.h b/db/version_edit.h index ee6499cdc3b..471b4e095ab 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -10,12 +10,12 @@ #pragma once #include #include +#include #include #include -#include -#include "rocksdb/cache.h" #include "db/dbformat.h" -#include "util/arena.h" +#include "memory/arena.h" +#include "rocksdb/cache.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 5f1ae98ba4f..23c63b7caea 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" -#include "util/coding.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/coding.h" namespace rocksdb { diff --git a/db/version_set.cc b/db/version_set.cc index 864fc975358..5d0529d2707 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -48,11 +48,11 @@ #include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 41c27fdab65..9b4072dc777 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -10,10 +10,10 @@ #include "db/version_set.h" #include "db/log_writer.h" #include "table/mock_table.h" -#include "util/logging.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/logging.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 20b5780c877..2fe5305f8d6 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -28,13 +28,13 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/write_batch.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index b1478e26e54..c0c47b0c34b 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -19,10 +19,10 @@ #include "db/wal_manager.h" #include "env/mock_env.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 88c52522917..5de602cee81 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,8 +18,8 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "util/string_util.h" #include "test_util/testharness.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index dbb4759fa03..aa3d077c40d 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -13,12 +13,12 @@ #include "db/db_impl.h" #include "db/write_callback.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" -#include "port/port.h" -#include "util/random.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/random.h" using std::string; diff --git a/db/write_thread.cc b/db/write_thread.cc index 872d32ca81b..5ee9439048b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -9,8 +9,8 @@ #include "db/column_family.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" -#include "util/random.h" #include "test_util/sync_point.h" +#include "util/random.h" namespace rocksdb { diff --git a/env/env.cc b/env/env.cc index dcf79fb7fe7..e5e0e99c0a0 100644 --- a/env/env.cc +++ b/env/env.cc @@ -10,11 +10,11 
@@ #include "rocksdb/env.h" #include +#include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" #include "port/sys_time.h" #include "rocksdb/options.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/env/env_posix.cc b/env/env_posix.cc index 3f75dd6893c..bf1a9e0e5c4 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -49,12 +49,12 @@ #include "port/port.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" #include "util/logging.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "util/thread_local.h" #include "util/threadpool_imp.h" diff --git a/env/env_test.cc b/env/env_test.cc index 852a99c1adc..615eca8b400 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -40,13 +40,13 @@ #include "env/env_chroot.h" #include "port/port.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" #ifdef OS_LINUX static const size_t kPageSize = sysconf(_SC_PAGESIZE); diff --git a/env/io_posix.cc b/env/io_posix.cc index 27198b1f975..313cbd8eee6 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -31,9 +31,9 @@ #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 44e3110d5e7..22f28f5375f 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -13,9 +13,9 @@ #include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" #include "util/logging.h" #include "util/mutexlock.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 122a5d6177e..510753b3b45 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -16,10 +16,10 @@ #include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" #ifndef ROCKSDB_LITE diff --git a/file/filename.cc b/file/filename.cc index ed19b4109ff..a8fb780054a 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -17,11 +17,11 @@ #include #include #include "rocksdb/env.h" +#include "test_util/sync_point.h" #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 9b7278c7d5b..d63170452c0 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -16,8 +16,8 @@ #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git 
a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 9d5de9a2f86..c6b8a92390e 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -22,8 +22,8 @@ #include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" -#include "util/string_util.h" #include "test_util/testharness.h" +#include "util/string_util.h" /* * Class: org_rocksdb_WriteBatchTest diff --git a/util/allocator.h b/memory/allocator.h similarity index 100% rename from util/allocator.h rename to memory/allocator.h diff --git a/util/arena.cc b/memory/arena.cc similarity index 99% rename from util/arena.cc rename to memory/arena.cc index 67e8a4db782..b774225535e 100644 --- a/util/arena.cc +++ b/memory/arena.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" +#include "memory/arena.h" #ifdef ROCKSDB_MALLOC_USABLE_SIZE #ifdef OS_FREEBSD #include @@ -21,8 +21,8 @@ #include #include "port/port.h" #include "rocksdb/env.h" -#include "util/logging.h" #include "test_util/sync_point.h" +#include "util/logging.h" namespace rocksdb { diff --git a/util/arena.h b/memory/arena.h similarity index 99% rename from util/arena.h rename to memory/arena.h index dc64154c857..fd97f57e1e5 100644 --- a/util/arena.h +++ b/memory/arena.h @@ -15,12 +15,12 @@ #ifndef OS_WIN #include #endif -#include -#include -#include #include #include -#include "util/allocator.h" +#include +#include +#include +#include "memory/allocator.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/arena_test.cc b/memory/arena_test.cc similarity index 99% rename from util/arena_test.cc rename to memory/arena_test.cc index 052f2a6d5db..18296d307d0 100644 --- a/util/arena_test.cc +++ b/memory/arena_test.cc @@ -7,9 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" -#include "util/random.h" +#include "memory/arena.h" #include "test_util/testharness.h" +#include "util/random.h" namespace rocksdb { diff --git a/util/concurrent_arena.cc b/memory/concurrent_arena.cc similarity index 97% rename from util/concurrent_arena.cc rename to memory/concurrent_arena.cc index cef77d7e75f..722eb3b60bd 100644 --- a/util/concurrent_arena.cc +++ b/memory/concurrent_arena.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/concurrent_arena.h" +#include "memory/concurrent_arena.h" #include #include "port/port.h" #include "util/random.h" diff --git a/util/concurrent_arena.h b/memory/concurrent_arena.h similarity index 99% rename from util/concurrent_arena.h rename to memory/concurrent_arena.h index a6191100fd0..6b41ab02470 100644 --- a/util/concurrent_arena.h +++ b/memory/concurrent_arena.h @@ -11,9 +11,9 @@ #include #include #include +#include "memory/allocator.h" +#include "memory/arena.h" #include "port/likely.h" -#include "util/allocator.h" -#include "util/arena.h" #include "util/core_local.h" #include "util/mutexlock.h" #include "util/thread_local.h" diff --git a/util/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc similarity index 99% rename from util/jemalloc_nodump_allocator.cc rename to memory/jemalloc_nodump_allocator.cc index cdd08e932e3..1f58351bef6 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/jemalloc_nodump_allocator.h" +#include "memory/jemalloc_nodump_allocator.h" #include #include diff --git a/util/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h similarity index 100% rename from util/jemalloc_nodump_allocator.h rename to memory/jemalloc_nodump_allocator.h diff --git a/util/memory_allocator.h b/memory/memory_allocator.h similarity index 100% rename from util/memory_allocator.h rename to memory/memory_allocator.h diff --git a/util/memory_usage.h b/memory/memory_usage.h similarity index 100% rename from util/memory_usage.h rename to memory/memory_usage.h diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index a1fa4938c52..ddd40aa059f 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
 #include
+#include "memory/allocator.h"
+#include "memory/arena.h"
 #include "rocksdb/write_buffer_manager.h"
-#include "util/allocator.h"
-#include "util/arena.h"

 namespace rocksdb {
diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc
index 878d2338356..e347abe6e69 100644
--- a/memtable/hash_linklist_rep.cc
+++ b/memtable/hash_linklist_rep.cc
@@ -10,13 +10,13 @@
 #include
 #include
 #include "db/memtable.h"
+#include "memory/arena.h"
 #include "memtable/skiplist.h"
 #include "monitoring/histogram.h"
 #include "port/port.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
-#include "util/arena.h"
 #include "util/hash.h"

 namespace rocksdb {
diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc
index d02919cd4ef..5c74657cd31 100644
--- a/memtable/hash_skiplist_rep.cc
+++ b/memtable/hash_skiplist_rep.cc
@@ -9,14 +9,14 @@

 #include

+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "port/port.h"
 #include "rocksdb/memtablerep.h"
-#include "util/arena.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
-#include "port/port.h"
 #include "util/murmurhash.h"
-#include "db/memtable.h"
-#include "memtable/skiplist.h"

 namespace rocksdb {
 namespace {
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h
index 1ef8f2b6dbc..c3adb2ddbd7 100644
--- a/memtable/inlineskiplist.h
+++ b/memtable/inlineskiplist.h
@@ -46,10 +46,10 @@
 #include
 #include
 #include
+#include "memory/allocator.h"
 #include "port/likely.h"
 #include "port/port.h"
 #include "rocksdb/slice.h"
-#include "util/allocator.h"
 #include "util/coding.h"
 #include "util/random.h"

diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc
index a2f62d5304a..9670f3fc64c 100644
--- a/memtable/inlineskiplist_test.cc
+++ b/memtable/inlineskiplist_test.cc
@@ -10,11 +10,11 @@
 #include "memtable/inlineskiplist.h"
 #include
 #include
+#include "memory/concurrent_arena.h"
 #include "rocksdb/env.h"
-#include "util/concurrent_arena.h"
+#include "test_util/testharness.h"
 #include "util/hash.h"
 #include "util/random.h"
-#include "test_util/testharness.h"

 namespace rocksdb {
diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc
index ae199096563..003d59b2a86 100644
--- a/memtable/memtablerep_bench.cc
+++ b/memtable/memtablerep_bench.cc
@@ -28,6 +28,7 @@ int main() {
 #include "db/dbformat.h"
 #include "db/memtable.h"
+#include "memory/arena.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/comparator.h"
@@ -35,11 +36,10 @@ int main() {
 #include "rocksdb/options.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/write_buffer_manager.h"
-#include "util/arena.h"
+#include "test_util/testutil.h"
 #include "util/gflags_compat.h"
 #include "util/mutexlock.h"
 #include "util/stop_watch.h"
-#include "test_util/testutil.h"

 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
 using GFLAGS_NAMESPACE::RegisterFlagValidator;
diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index 47a89034eb9..275daa7940f 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -32,10 +32,10 @@
 #pragma once
 #include
-#include
 #include
+#include
+#include "memory/allocator.h"
 #include "port/port.h"
-#include "util/allocator.h"
 #include "util/random.h"

 namespace rocksdb {
diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc
index 054e3c9df07..33cc19b2d38 100644
--- a/memtable/skiplist_test.cc
+++ b/memtable/skiplist_test.cc
@@ -9,11 +9,11 @@
 #include "memtable/skiplist.h"
 #include
+#include "memory/arena.h"
 #include "rocksdb/env.h"
-#include "util/arena.h"
+#include "test_util/testharness.h"
 #include "util/hash.h"
 #include "util/random.h"
-#include "test_util/testharness.h"

 namespace rocksdb {
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc
index 32870b127d2..3955217cce7 100644
--- a/memtable/skiplistrep.cc
+++ b/memtable/skiplistrep.cc
@@ -3,10 +3,10 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 //
-#include "memtable/inlineskiplist.h"
 #include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/inlineskiplist.h"
 #include "rocksdb/memtablerep.h"
-#include "util/arena.h"

 namespace rocksdb {
 namespace {
diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc
index 827ab8a5d2b..e7acc94ad67 100644
--- a/memtable/vectorrep.cc
+++ b/memtable/vectorrep.cc
@@ -12,8 +12,8 @@
 #include
 #include

-#include "util/arena.h"
 #include "db/memtable.h"
+#include "memory/arena.h"
 #include "memtable/stl_wrappers.h"
 #include "port/port.h"
 #include "util/mutexlock.h"
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 0b531a6ec5e..388256abd9f 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -10,7 +10,6 @@
 #include
 #include

-#include "table/plain/plain_table_factory.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/convenience.h"
@@ -23,6 +22,7 @@
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "table/block_based/block_based_table_factory.h"
+#include "table/plain/plain_table_factory.h"
 #include "util/cast_util.h"
 #include "util/string_util.h"

diff --git a/options/options_parser.cc b/options/options_parser.cc
index 9ae3dfb2785..d5b0c25a32e 100644
--- a/options/options_parser.cc
+++ b/options/options_parser.cc
@@ -16,10 +16,10 @@
 #include "options/options_helper.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
+#include "test_util/sync_point.h"
 #include "util/cast_util.h"
 #include "util/file_reader_writer.h"
 #include "util/string_util.h"
-#include "test_util/sync_point.h"

 #include "port/port.h"
diff --git a/options/options_test.cc b/options/options_test.cc
index 704b2db802b..429b607e4f9 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -27,11 +27,11 @@
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/utilities/leveldb_options.h"
 #include "rocksdb/utilities/object_registry.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
 #include "util/random.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
-#include "test_util/testharness.h"
-#include "test_util/testutil.h"
 #include "utilities/merge_operators/bytesxor.h"

 #ifndef GFLAGS
diff --git a/port/win/env_default.cc b/port/win/env_default.cc
index db64878bc02..584a524cf86 100644
--- a/port/win/env_default.cc
+++ b/port/win/env_default.cc
@@ -11,8 +11,8 @@
 #include

 #include "port/win/env_win.h"
-#include "util/compression_context_cache.h"
 #include "test_util/sync_point.h"
+#include "util/compression_context_cache.h"
 #include "util/thread_local.h"

 namespace rocksdb {
diff --git a/port/win/io_win.cc b/port/win/io_win.cc
index 15d1e711412..6fbf6fc6301 100644
--- a/port/win/io_win.cc
+++ b/port/win/io_win.cc
@@ -10,9 +10,9 @@

 #include "port/win/io_win.h"

 #include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
 #include "util/aligned_buffer.h"
 #include "util/coding.h"
-#include "test_util/sync_point.h"

 namespace rocksdb {
 namespace port {
diff --git a/src.mk b/src.mk
index a0f4043bf76..c1ab36b8a61 100644
--- a/src.mk
+++ b/src.mk
@@ -71,6 +71,9 @@ LIB_SOURCES = \
   file/file_util.cc \
   file/filename.cc \
   file/sst_file_manager_impl.cc \
+  memory/arena.cc \
+  memory/concurrent_arena.cc \
+  memory/jemalloc_nodump_allocator.cc \
   memtable/alloc_tracker.cc \
   memtable/hash_linklist_rep.cc \
   memtable/hash_skiplist_rep.cc \
@@ -135,7 +138,6 @@ LIB_SOURCES = \
   test_util/sync_point_impl.cc \
   test_util/transaction_test_util.cc \
   tools/dump/db_dump_tool.cc \
-  util/arena.cc \
   util/auto_roll_logger.cc \
   util/bloom.cc \
   util/build_version.cc \
@@ -143,7 +145,6 @@ LIB_SOURCES = \
   util/compaction_job_stats_impl.cc \
   util/comparator.cc \
   util/compression_context_cache.cc \
-  util/concurrent_arena.cc \
   util/concurrent_task_limiter_impl.cc \
   util/crc32c.cc \
   util/dynamic_bloom.cc \
@@ -151,7 +152,6 @@ LIB_SOURCES = \
   util/file_reader_writer.cc \
   util/filter_policy.cc \
   util/hash.cc \
-  util/jemalloc_nodump_allocator.cc \
   util/log_buffer.cc \
   util/murmurhash.cc \
   util/random.cc \
@@ -339,6 +339,7 @@ MAIN_SOURCES = \
   env/env_basic_test.cc \
   env/env_test.cc \
   env/mock_env_test.cc \
+  memory/arena_test.cc \
   memtable/inlineskiplist_test.cc \
   memtable/memtablerep_bench.cc \
   memtable/skiplist_test.cc \
@@ -367,7 +368,6 @@ MAIN_SOURCES = \
   tools/reduce_levels_test.cc \
   tools/sst_dump_test.cc \
   tools/trace_analyzer_test.cc \
-  util/arena_test.cc \
   util/auto_roll_logger_test.cc \
   util/autovector_test.cc \
   util/bloom_test.cc \
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 8bf6f535612..3c54389b08a 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -22,16 +22,16 @@
 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
-#include "table/format.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/table.h"
 #include "table/block_based/block_prefix_index.h"
 #include "table/block_based/data_block_hash_index.h"
+#include "table/format.h"
 #include "table/internal_iterator.h"
-#include "util/random.h"
 #include "test_util/sync_point.h"
+#include "util/random.h"

 namespace rocksdb {
diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc
index 8d074275ce6..e0ca57f1c51 100644
--- a/table/block_based/block_based_filter_block_test.cc
+++ b/table/block_based/block_based_filter_block_test.cc
@@ -10,11 +10,11 @@
 #include "table/block_based/block_based_filter_block.h"

 #include "rocksdb/filter_policy.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
 #include "util/coding.h"
 #include "util/hash.h"
 #include "util/string_util.h"
-#include "test_util/testharness.h"
-#include "test_util/testutil.h"

 namespace rocksdb {
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 034c6b238fd..9769e394f87 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -31,9 +31,9 @@
 #include "rocksdb/table.h"

 #include "table/block_based/block.h"
+#include "table/block_based/block_based_filter_block.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_reader.h"
-#include "table/block_based/block_based_filter_block.h"
 #include "table/block_based/block_builder.h"
 #include "table/block_based/filter_block.h"
 #include "table/block_based/full_filter_block.h"
@@ -41,15 +41,14 @@
 #include "table/format.h"
 #include "table/table_builder.h"

+#include "memory/memory_allocator.h"
 #include "util/coding.h"
 #include "util/compression.h"
 #include "util/crc32c.h"
-#include "util/memory_allocator.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
 #include "util/xxhash.h"

-
 namespace rocksdb {

 extern const std::string kHashIndexPrefixesBlock;
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 0c580b445dd..a1ef3889112 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -14,11 +14,11 @@
 #include
 #include

-#include "table/meta_blocks.h"
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/listener.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
+#include "table/meta_blocks.h"
 #include "table/table_builder.h"
 #include "util/compression.h"

diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 609679394ea..121cc916e25 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -23,8 +23,8 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/flush_block_policy.h"
-#include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/format.h"
 #include "util/mutexlock.h"
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 725ecdb4e3f..944a1fde43e 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -15,10 +15,10 @@
 #include
 #include

-#include "table/block_fetcher.h"
-#include "table/meta_blocks.h"
 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
+#include "table/block_fetcher.h"
+#include "table/meta_blocks.h"

 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
@@ -45,12 +45,12 @@
 #include "table/two_level_iterator.h"

 #include "monitoring/perf_context_imp.h"
+#include "test_util/sync_point.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
-#include "test_util/sync_point.h"
 #include "util/xxhash.h"

 namespace rocksdb {
@@ -202,19 +202,18 @@ bool PrefixExtractorChanged(const TableProperties* table_properties,
 // it is owned by the reader or stored in the cache, or whether it is pinned
 // in the cache or not.
 class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
-public:
-  IndexReaderCommon(BlockBasedTable* t,
-                    CachableEntry&& index_block)
-    : table_(t)
-    , index_block_(std::move(index_block))
-  {
+ public:
+  IndexReaderCommon(BlockBasedTable* t, CachableEntry&& index_block)
+      : table_(t), index_block_(std::move(index_block)) {
     assert(table_ != nullptr);
   }

-protected:
+ protected:
   static Status ReadIndexBlock(BlockBasedTable* table,
-      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options,
-      GetContext* get_context, CachableEntry* index_block);
+                               FilePrefetchBuffer* prefetch_buffer,
+                               const ReadOptions& read_options,
+                               GetContext* get_context,
+                               CachableEntry* index_block);

   BlockBasedTable* table() const { return table_; }

@@ -230,7 +229,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
     assert(table_->get_rep() != nullptr);

     const TableProperties* const properties =
-      table_->get_rep()->table_properties.get();
+        table_->get_rep()->table_properties.get();

     return properties == nullptr || !properties->index_key_is_user_key;
   }
@@ -240,7 +239,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
     assert(table_->get_rep() != nullptr);

     const TableProperties* const properties =
-      table_->get_rep()->table_properties.get();
+        table_->get_rep()->table_properties.get();

     return properties == nullptr || !properties->index_value_is_delta_encoded;
   }
@@ -251,20 +250,20 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
   size_t ApproximateIndexBlockMemoryUsage() const {
     assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);

-    return index_block_.GetOwnValue() ?
-      index_block_.GetValue()->ApproximateMemoryUsage() : 0;
+    return index_block_.GetOwnValue()
+               ? index_block_.GetValue()->ApproximateMemoryUsage()
+               : 0;
   }

-private:
+ private:
   BlockBasedTable* table_;
   CachableEntry index_block_;
 };

 Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
-    BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
-    const ReadOptions& read_options, GetContext* get_context,
-    CachableEntry* index_block) {
-
+    BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, GetContext* get_context,
+    CachableEntry* index_block) {
   PERF_TIMER_GUARD(read_index_block_nanos);

   assert(table != nullptr);
@@ -275,27 +274,27 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
   assert(rep != nullptr);

   constexpr bool is_index = true;
-  const Status s = BlockBasedTable::RetrieveBlock(prefetch_buffer,
-    rep, read_options, rep->footer.index_handle(),
-    UncompressionDict::GetEmptyDict(), index_block, is_index, get_context);
+  const Status s = BlockBasedTable::RetrieveBlock(
+      prefetch_buffer, rep, read_options, rep->footer.index_handle(),
+      UncompressionDict::GetEmptyDict(), index_block, is_index, get_context);

   return s;
 }

 Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
-    const ReadOptions& read_options, GetContext* get_context,
-    CachableEntry* index_block) const {
-
+    const ReadOptions& read_options, GetContext* get_context,
+    CachableEntry* index_block) const {
   assert(index_block != nullptr);

   if (!index_block_.IsEmpty()) {
-    *index_block = CachableEntry(index_block_.GetValue(),
-      nullptr /* cache */, nullptr /* cache_handle */, false /* own_value */);
+    *index_block =
+        CachableEntry(index_block_.GetValue(), nullptr /* cache */,
+                      nullptr /* cache_handle */, false /* own_value */);

     return Status::OK();
   }

-  return ReadIndexBlock(table_, nullptr /* prefetch_buffer */,
-    read_options, get_context, index_block);
+  return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, read_options,
+                        get_context, index_block);
 }

 // Index that allows binary search lookup in a two-level index structure.
@@ -335,10 +334,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
   InternalIteratorBase* NewIterator(
       const ReadOptions& read_options, bool /* disable_prefix_seek */,
       IndexBlockIter* iter, GetContext* get_context) override {
-
     CachableEntry index_block;
-    const Status s = GetOrReadIndexBlock(read_options, get_context,
-                                         &index_block);
+    const Status s =
+        GetOrReadIndexBlock(read_options, get_context, &index_block);
     if (!s.ok()) {
       if (iter != nullptr) {
         iter->Invalidate(s);
@@ -403,7 +401,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
     if (!s.ok()) {
       ROCKS_LOG_WARN(rep->ioptions.info_log,
                      "Error retrieving top-level index block while trying to "
-                     "cache index partitions: %s", s.ToString().c_str());
+                     "cache index partitions: %s",
+                     s.ToString().c_str());
       return;
     }
@@ -474,10 +473,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
   }

  private:
-  PartitionIndexReader(BlockBasedTable* t,
-                       CachableEntry&& index_block)
-    : IndexReaderCommon(t, std::move(index_block))
-  {}
+  PartitionIndexReader(BlockBasedTable* t, CachableEntry&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}

   std::unordered_map> partition_map_;
 };
@@ -521,8 +518,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
       const ReadOptions& read_options, bool /* disable_prefix_seek */,
       IndexBlockIter* iter, GetContext* get_context) override {
     CachableEntry index_block;
-    const Status s = GetOrReadIndexBlock(read_options, get_context,
-                                         &index_block);
+    const Status s =
+        GetOrReadIndexBlock(read_options, get_context, &index_block);
     if (!s.ok()) {
       if (iter != nullptr) {
         iter->Invalidate(s);
@@ -558,8 +555,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon
  private:
   BinarySearchIndexReader(BlockBasedTable* t,
                           CachableEntry&& index_block)
-    : IndexReaderCommon(t, std::move(index_block))
-    {}
+      : IndexReaderCommon(t, std::move(index_block)) {}
 };

 // Index that leverages an internal hash table to quicken the lookup for a given
@@ -620,7 +616,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
     const ImmutableCFOptions& ioptions = rep->ioptions;
     const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
     MemoryAllocator* const memory_allocator =
-      GetMemoryAllocator(rep->table_options);
+        GetMemoryAllocator(rep->table_options);

     // Read contents for the blocks
     BlockContents prefixes_contents;
@@ -661,8 +657,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
       const ReadOptions& read_options, bool disable_prefix_seek,
       IndexBlockIter* iter, GetContext* get_context) override {
     CachableEntry index_block;
-    const Status s = GetOrReadIndexBlock(read_options, get_context,
-                                         &index_block);
+    const Status s =
+        GetOrReadIndexBlock(read_options, get_context, &index_block);
     if (!s.ok()) {
       if (iter != nullptr) {
         iter->Invalidate(s);
@@ -673,8 +669,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon
     }

     Statistics* kNullStats = nullptr;
-    const bool total_order_seek = read_options.total_order_seek ||
-      disable_prefix_seek;
+    const bool total_order_seek =
+        read_options.total_order_seek || disable_prefix_seek;
     // We don't return pinned data from index blocks, so no need
     // to set `block_contents_pinned`.
     auto it = index_block.GetValue()->NewIterator(
@@ -703,10 +699,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
   }

  private:
-  HashIndexReader(BlockBasedTable* t,
-                  CachableEntry&& index_block)
-    : IndexReaderCommon(t, std::move(index_block))
-  {}
+  HashIndexReader(BlockBasedTable* t, CachableEntry&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}

   std::unique_ptr prefix_index_;
 };
@@ -1439,7 +1433,6 @@ Status BlockBasedTable::GetDataBlockFromCache(
     const ReadOptions& read_options, CachableEntry* block,
     const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit,
     bool is_index, GetContext* get_context) {
-
   assert(block);
   assert(block->IsEmpty());
@@ -1933,7 +1926,6 @@ BlockBasedTable::GetUncompressionDict(Rep* rep,
 InternalIteratorBase* BlockBasedTable::NewIndexIterator(
     const ReadOptions& read_options, bool disable_prefix_seek,
     IndexBlockIter* input_iter, GetContext* get_context) {
-
   assert(rep_ != nullptr);
   assert(rep_->index_reader != nullptr);
@@ -1963,11 +1955,11 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
   const bool no_io = (ro.read_tier == kBlockCacheTier);
   auto uncompression_dict_storage =
-    GetUncompressionDict(rep, prefetch_buffer, no_io, get_context);
+      GetUncompressionDict(rep, prefetch_buffer, no_io, get_context);
   const UncompressionDict& uncompression_dict =
-    uncompression_dict_storage.GetValue() == nullptr
-    ? UncompressionDict::GetEmptyDict()
-    : *uncompression_dict_storage.GetValue();
+      uncompression_dict_storage.GetValue() == nullptr
+          ? UncompressionDict::GetEmptyDict()
+          : *uncompression_dict_storage.GetValue();

   CachableEntry block;
   s = RetrieveBlock(prefetch_buffer, rep, ro, handle, uncompression_dict,
@@ -1988,12 +1980,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
   // 2. it's pointing to immortal source. If own_bytes is true then we are
   //    not reading data from the original source, whether immortal or not.
   //    Otherwise, the block is pinned iff the source is immortal.
-  const bool block_contents_pinned = block.IsCached() ||
-    (!block.GetValue()->own_bytes() && rep->immortal_table);
+  const bool block_contents_pinned =
+      block.IsCached() ||
+      (!block.GetValue()->own_bytes() && rep->immortal_table);
   iter = block.GetValue()->NewIterator(
-    &rep->internal_comparator, rep->internal_comparator.user_comparator(),
-    iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
-    index_key_is_full, block_contents_pinned);
+      &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+      iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq,
+      index_key_is_full, block_contents_pinned);

   if (!block.IsCached()) {
     if (!ro.fill_cache && rep->cache_key_prefix_size != 0) {
@@ -2015,7 +2008,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
       char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
                                  next_cache_key_id_++);
       assert(end - cache_key <=
-        static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length));
+             static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length));
       const Slice unique_key(cache_key, static_cast(end - cache_key));
       s = block_cache->Insert(unique_key, nullptr,
                               block.GetValue()->ApproximateMemoryUsage(),
@@ -2066,11 +2059,11 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
                        compressed_cache_key);
   }

-  s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
-                            rep, ro, block_entry, uncompression_dict,
-                            !is_index ?
-                              rep->table_options.read_amp_bytes_per_bit : 0,
-                            is_index, get_context);
+  s = GetDataBlockFromCache(
+      key, ckey, block_cache, block_cache_compressed, rep, ro, block_entry,
+      uncompression_dict,
+      !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, is_index,
+      get_context);

   // Can't find the block from the cache. If I/O is allowed, read from the
   // file.
@@ -2119,7 +2112,6 @@ Status BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro,
     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
     CachableEntry* block_entry, bool is_index, GetContext* get_context) {
-
   assert(rep);
   assert(block_entry);
   assert(block_entry->IsEmpty());
@@ -2127,15 +2119,15 @@ Status BlockBasedTable::RetrieveBlock(
   Status s;
   if (!is_index || rep->table_options.cache_index_and_filter_blocks) {
     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle,
-                                     uncompression_dict, block_entry,
-                                     is_index, get_context);
+                                     uncompression_dict, block_entry, is_index,
+                                     get_context);

     if (!s.ok()) {
       return s;
     }

     if (block_entry->GetValue() != nullptr) {
-      assert (s.ok());
+      assert(s.ok());
       return s;
     }
   }
@@ -2151,16 +2143,14 @@ Status BlockBasedTable::RetrieveBlock(
   {
     StopWatch sw(rep->ioptions.env, rep->ioptions.statistics,
-                READ_BLOCK_GET_MICROS);
-    s = ReadBlockFromFile(rep->file.get(), prefetch_buffer, rep->footer, ro,
-                          handle, &block, rep->ioptions,
-                          rep->blocks_maybe_compressed,
-                          rep->blocks_maybe_compressed, uncompression_dict,
-                          rep->persistent_cache_options,
-                          rep->get_global_seqno(is_index),
-                          !is_index ?
-                            rep->table_options.read_amp_bytes_per_bit : 0,
-                          GetMemoryAllocator(rep->table_options));
+                 READ_BLOCK_GET_MICROS);
+    s = ReadBlockFromFile(
+        rep->file.get(), prefetch_buffer, rep->footer, ro, handle, &block,
+        rep->ioptions, rep->blocks_maybe_compressed,
+        rep->blocks_maybe_compressed, uncompression_dict,
+        rep->persistent_cache_options, rep->get_global_seqno(is_index),
+        !is_index ? rep->table_options.read_amp_bytes_per_bit : 0,
+        GetMemoryAllocator(rep->table_options));
   }

   if (!s.ok()) {
@@ -2756,9 +2746,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       need_upper_bound_check = PrefixExtractorChanged(
           rep_->table_properties.get(), prefix_extractor);
     }
-    auto iiter =
-        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
-                         get_context);
+    auto iiter = NewIndexIterator(read_options, need_upper_bound_check,
+                                  &iiter_on_stack, get_context);
     std::unique_ptr> iiter_unique_ptr;
     if (iiter != &iiter_on_stack) {
       iiter_unique_ptr.reset(iiter);
@@ -2877,9 +2866,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options,
       need_upper_bound_check = PrefixExtractorChanged(
           rep_->table_properties.get(), prefix_extractor);
     }
-    auto iiter = NewIndexIterator(
-        read_options, need_upper_bound_check, &iiter_on_stack,
-        sst_file_range.begin()->get_context);
+    auto iiter =
+        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+                         sst_file_range.begin()->get_context);
     std::unique_ptr> iiter_unique_ptr;
     if (iiter != &iiter_on_stack) {
       iiter_unique_ptr.reset(iiter);
@@ -3105,9 +3094,9 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
   }

   char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
-  Slice cache_key = GetCacheKey(rep_->cache_key_prefix,
-                                rep_->cache_key_prefix_size, handle,
-                                cache_key_storage);
+  Slice cache_key =
+      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
+                  cache_key_storage);

   Cache::Handle* const cache_handle = cache->Lookup(cache_key);
   if (cache_handle == nullptr) {
@@ -3187,9 +3176,8 @@ Status BlockBasedTable::CreateIndexReader(
       ROCKS_LOG_WARN(rep_->ioptions.info_log,
                      "Unable to read the metaindex block."
                      " Fall back to binary search index.");
-      return BinarySearchIndexReader::Create(this, prefetch_buffer,
-                                             use_cache, prefetch, pin,
-                                             index_reader);
+      return BinarySearchIndexReader::Create(
+          this, prefetch_buffer, use_cache, prefetch, pin, index_reader);
     }
     meta_index_iter = meta_iter_guard.get();
   }
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 6d265ba755b..3af617fecfa 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -263,12 +263,12 @@ class BlockBasedTable : public TableReader {
   // Similar to the above, with one crucial difference: it will retrieve the
   // block from the file even if there are no caches configured (assuming the
   // read options allow I/O).
- static Status RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, - const ReadOptions& ro, const BlockHandle& handle, - const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, - GetContext* get_context); + static Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, + GetContext* get_context); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -310,8 +310,8 @@ class BlockBasedTable : public TableReader { static Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, - const ReadOptions& read_options, - CachableEntry* block, const UncompressionDict& uncompression_dict, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, bool is_index = false, GetContext* get_context = nullptr); @@ -351,10 +351,10 @@ class BlockBasedTable : public TableReader { // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. - Status CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* preloaded_meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader); + Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, + bool use_cache, bool prefetch, bool pin, + IndexReader** index_reader); bool FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, diff --git a/table/block_based/block_prefix_index.cc b/table/block_based/block_prefix_index.cc index 0050f1f1e58..6e24f17cf68 100644 --- a/table/block_based/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -7,10 +7,10 @@ #include +#include "memory/arena.h" #include "rocksdb/comparator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "util/arena.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index a4c5678881e..2dab4627cb6 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -23,9 +23,9 @@ #include "table/block_based/block.h" #include "table/block_based/block_builder.h" #include "table/format.h" -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" namespace rocksdb { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 204e92ecbe3..5ec0938714f 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
- #include #include #include diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 8b01214c7eb..378cdacfff6 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -24,11 +24,11 @@ #include #include #include "db/dbformat.h" -#include "table/format.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/format.h" #include "table/multiget_context.h" #include "util/hash.h" diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 3e5d82733b0..61df028c920 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -11,12 +11,12 @@ #include #include +#include "db/dbformat.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "db/dbformat.h" -#include "util/hash.h" #include "table/block_based/filter_block.h" +#include "util/hash.h" namespace rocksdb { diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 8b99f54b03f..82c43b34ed6 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -7,11 +7,11 @@ #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index f11ecd4f4bc..738b9e3e099 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -17,8 +17,8 @@ #include "rocksdb/comparator.h" #include "rocksdb/flush_block_policy.h" -#include "table/format.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/format.h" // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace rocksdb { diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 735f1c6e3eb..6860bf82fec 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -15,8 +15,8 @@ #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/full_filter_block.h" #include "table/block_based/cachable_entry.h" +#include "table/block_based/full_filter_block.h" #include "util/autovector.h" namespace rocksdb { diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 2bcafa9771a..9a1a4d526f1 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -3,20 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
- #include #include "rocksdb/filter_policy.h" -#include "table/full_filter_bits_builder.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/full_filter_bits_builder.h" #include "index_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" #include "util/logging.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 6c663702900..263abbfcf80 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -12,6 +12,7 @@ #include #include +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" @@ -24,7 +25,6 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 56b74b50427..6451d6d2acc 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include "memory/memory_allocator.h" #include "table/block_based/block.h" #include "table/format.h" -#include "util/memory_allocator.h" namespace rocksdb { diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 1467e2a8d1b..f9d46c03bd8 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -10,11 +10,11 @@ #include #include -#include "table/meta_blocks.h" #include "table/cuckoo/cuckoo_table_builder.h" -#include "util/file_reader_writer.h" +#include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" namespace rocksdb { extern const uint64_t kCuckooTableMagicNumber; diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 72885be940e..905528e9bbf 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -15,13 +15,13 @@ #include #include #include +#include "memory/arena.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" -#include "table/internal_iterator.h" -#include "table/meta_blocks.h" #include "table/cuckoo/cuckoo_table_factory.h" #include "table/get_context.h" -#include "util/arena.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index 71e231336c5..681e0dfdf3e 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -22,17 +22,17 @@ int main() { #include #include +#include "memory/arena.h" #include "table/cuckoo/cuckoo_table_builder.h" #include "table/cuckoo/cuckoo_table_factory.h" #include "table/cuckoo/cuckoo_table_reader.h" #include "table/get_context.h" #include "table/meta_blocks.h" -#include "util/arena.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/format.cc b/table/format.cc index 
1adcce6f3f4..3f95fd4d44b 100644 --- a/table/format.cc +++ b/table/format.cc @@ -13,6 +13,7 @@ #include #include "block_fetcher.h" +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" @@ -24,7 +25,6 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" diff --git a/table/format.h b/table/format.h index 84242303ec7..baad78070ca 100644 --- a/table/format.h +++ b/table/format.h @@ -22,11 +22,11 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "memory/memory_allocator.h" #include "options/cf_options.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" #include "util/file_reader_writer.h" -#include "util/memory_allocator.h" namespace rocksdb { diff --git a/table/get_context.h b/table/get_context.h index 8df343b3653..ddce33fb3be 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#include #include +#include #include "db/merge_context.h" #include "db/read_callback.h" #include "rocksdb/env.h" diff --git a/table/iterator.cc b/table/iterator.cc index 0475b9d1342..97a0cef5e08 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/iterator.h" +#include "memory/arena.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" namespace rocksdb { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 85a2fcc0324..1a0d4df8995 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -12,6 +12,7 @@ #include #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" @@ -19,11 +20,10 @@ #include "table/internal_iterator.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" +#include "test_util/sync_point.h" #include "util/autovector.h" #include "util/heap.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 9d56c5b9c29..341a1185579 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -16,9 +16,9 @@ #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" #include "table/table_properties_internal.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/mock_table.h b/table/mock_table.h index f99941863a9..42e28266d99 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -12,16 +12,16 @@ #include #include -#include "util/kv_map.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/table_builder.h" #include "table/table_reader.h" -#include "util/mutexlock.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/kv_map.h" +#include "util/mutexlock.h" namespace rocksdb { namespace mock { diff --git a/table/plain/plain_table_builder.cc 
b/table/plain/plain_table_builder.cc index 6160d7afd9e..4d50d817643 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -12,18 +12,18 @@ #include #include +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/plain/plain_table_factory.h" -#include "db/dbformat.h" #include "table/block_based/block_builder.h" #include "table/bloom_block.h" -#include "table/plain/plain_table_index.h" #include "table/format.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" diff --git a/table/plain/plain_table_index.h b/table/plain/plain_table_index.h index 1457fd00d81..7c8ed1953b0 100644 --- a/table/plain/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -11,10 +11,10 @@ #include #include "db/dbformat.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "options/cf_options.h" #include "rocksdb/options.h" -#include "util/arena.h" #include "util/hash.h" #include "util/murmurhash.h" diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 9c4b614b549..c84f337eb42 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -9,8 +9,8 @@ #include #include #include "db/dbformat.h" -#include "table/plain/plain_table_reader.h" #include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" #include "util/file_reader_writer.h" namespace rocksdb { diff --git a/table/plain/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h index 26af3f6d8bd..5f65d5a6560 100644 --- a/table/plain/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -8,8 +8,8 @@ #ifndef ROCKSDB_LITE #include -#include "rocksdb/slice.h" #include "db/dbformat.h" +#include "rocksdb/slice.h" #include "table/plain/plain_table_reader.h" // The file contains three helper classes of PlainTable format, diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index b4aad55876b..38852059bf9 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -20,19 +20,19 @@ #include "rocksdb/statistics.h" #include "table/block_based/block.h" -#include "table/bloom_block.h" #include "table/block_based/filter_block.h" +#include "table/bloom_block.h" #include "table/format.h" +#include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" -#include "table/two_level_iterator.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_key_coding.h" -#include "table/get_context.h" +#include "table/two_level_iterator.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "monitoring/perf_context_imp.h" -#include "util/arena.h" #include "util/coding.h" #include "util/dynamic_bloom.h" #include "util/hash.h" diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index ec6e6a7febb..6c1c12ab8bb 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -13,15 +13,15 @@ #include #include "db/dbformat.h" +#include "memory/arena.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include 
"table/table_reader.h" #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_index.h" -#include "util/arena.h" +#include "table/table_reader.h" #include "util/dynamic_bloom.h" #include "util/file_reader_writer.h" diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 69993492d48..b53f3161e3e 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -11,8 +11,8 @@ #include "rocksdb/table.h" #include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" -#include "util/file_reader_writer.h" #include "test_util/sync_point.h" +#include "util/file_reader_writer.h" namespace rocksdb { diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index f2ae016c10d..822c2294bb7 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,10 +22,10 @@ int main() { #include "table/internal_iterator.h" #include "table/plain/plain_table_factory.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" -#include "util/gflags_compat.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/gflags_compat.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/table/table_test.cc b/table/table_test.cc index 372443b536a..c59c9d8c33f 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -46,12 +46,12 @@ #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" -#include "util/compression.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/compression.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index ba883763e9f..7ff73cd4e4f 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -9,11 +9,11 @@ #include "table/two_level_iterator.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/format.h" -#include "util/arena.h" namespace rocksdb { diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 12caa2809ad..30aafb66069 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -60,6 +60,8 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/write_batch.h" +#include "test_util/testutil.h" +#include "test_util/transaction_test_util.h" #include "util/cast_util.h" #include "util/compression.h" #include "util/crc32c.h" @@ -68,8 +70,6 @@ #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "test_util/testutil.h" -#include "test_util/transaction_test_util.h" #include "util/xxhash.h" #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 52a1f9b91eb..4eb5472acec 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -10,9 +10,9 @@ #include "rocksdb/db_bench_tool.h" #include "options/options_parser.h" #include "rocksdb/utilities/options_util.h" -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" 
+#include "util/random.h" #ifdef GFLAGS #include "util/gflags_compat.h" diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 41ae4c2761e..c7ad71738fa 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -18,8 +18,8 @@ int main() { #include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/types.h" -#include "util/gflags_compat.h" #include "test_util/testutil.h" +#include "util/gflags_compat.h" // Run a thread to perform Put's. // Another thread uses GetUpdatesSince API to keep getting the updates. diff --git a/tools/db_stress.cc b/tools/db_stress.cc index b9ab1a2df11..c112cb348ff 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1977,8 +1977,9 @@ class StressTest { } // Check if the multiget batch crossed the ops_per_open boundary. If it // did, then we should vote to reopen - if (i != 0 && (i % ops_per_open == 0 || - i % ops_per_open < (i - multiget_batch_size) % ops_per_open)) { + if (i != 0 && + (i % ops_per_open == 0 || + i % ops_per_open < (i - multiget_batch_size) % ops_per_open)) { { thread->stats.FinishedSingleOp(); MutexLock l(thread->shared->GetMutex()); @@ -2173,7 +2174,7 @@ class StressTest { snap_state); } while (!thread->snapshot_queue.empty() && - i >= thread->snapshot_queue.front().first) { + i >= thread->snapshot_queue.front().first) { auto snap_state = thread->snapshot_queue.front().second; assert(snap_state.snapshot); // Note: this is unsafe as the cf might be dropped concurrently. But it @@ -2202,7 +2203,7 @@ class StressTest { // number of ops multiget_batch_size = static_cast( std::min(static_cast(thread->rand.Uniform(64)), - FLAGS_ops_per_thread - i - 1)); + FLAGS_ops_per_thread - i - 1)); // If its the last iteration, ensure that multiget_batch_size is 1 multiget_batch_size = std::max(multiget_batch_size, 1); rand_keys = GenerateNKeys(thread, multiget_batch_size, i); diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index a76416b6c1d..3aa0e3cf36d 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -10,10 +10,10 @@ #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/utilities/ldb_cmd.h" -#include "tools/ldb_cmd_impl.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "tools/ldb_cmd_impl.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index ea27f3c8d45..d3b1f0e581d 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -15,9 +15,9 @@ #include "rocksdb/filter_policy.h" #include "table/block_based/block_based_table_factory.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" namespace rocksdb { diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index 2f31c5d8249..7c242f60f26 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -27,9 +27,9 @@ int main() { #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/trace_reader_writer.h" -#include "tools/trace_analyzer_tool.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "tools/trace_analyzer_tool.h" #include "util/trace_replay.h" namespace rocksdb { diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 5a2049b6405..a5b2139fcaf 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -13,8 +13,8 @@ #include 
"file/filename.h" #include "port/port.h" #include "port/util_logger.h" -#include "util/mutexlock.h" #include "test_util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 3adbdbb1363..87de5ed5b9f 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -19,9 +19,9 @@ #include #include "port/port.h" #include "rocksdb/db.h" -#include "util/logging.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/logging.h" namespace rocksdb { namespace { diff --git a/util/autovector_test.cc b/util/autovector_test.cc index edb7af9eaf2..6b1b36e8d18 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -9,10 +9,10 @@ #include #include "rocksdb/env.h" -#include "util/autovector.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/autovector.h" +#include "util/string_util.h" using std::cout; using std::endl; diff --git a/util/bloom.cc b/util/bloom.cc index bedf4a65839..953a42fa213 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -11,8 +11,8 @@ #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" -#include "table/full_filter_bits_builder.h" #include "table/block_based/full_filter_block.h" +#include "table/full_filter_bits_builder.h" #include "util/coding.h" #include "util/hash.h" diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 87cd9da5569..7a13728308c 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -17,13 +17,13 @@ int main() { #include +#include "memory/arena.h" #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" -#include "util/arena.h" -#include "util/gflags_compat.h" -#include "util/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/gflags_compat.h" +#include "util/logging.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/compression.h b/util/compression.h index b901ceb3518..aa8af74499b 100644 --- a/util/compression.h +++ b/util/compression.h @@ -20,11 +20,11 @@ #endif // ROCKSDB_MALLOC_USABLE_SIZE #include +#include "memory/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "util/coding.h" #include "util/compression_context_cache.h" -#include "util/memory_allocator.h" #include "util/string_util.h" #ifdef SNAPPY diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 8e90efd89a7..4dfccb0bf36 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -7,9 +7,9 @@ #include +#include "memory/allocator.h" #include "port/port.h" #include "rocksdb/slice.h" -#include "util/allocator.h" #include "util/hash.h" namespace rocksdb { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index a8a7000f648..036e0128008 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -24,13 +24,13 @@ int main() { #include #include "dynamic_bloom.h" +#include "memory/arena.h" #include "port/port.h" -#include "util/arena.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/event_logger_test.cc b/util/event_logger_test.cc index 16c6c59f70e..1ee0c4d9787 100644 --- a/util/event_logger_test.cc +++ b/util/event_logger_test.cc @@ -5,8 +5,8 @@ #include 
-#include "util/event_logger.h" #include "test_util/testharness.h" +#include "util/event_logger.h" namespace rocksdb { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 3003a1ebac0..2c4e0a39f67 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -15,9 +15,9 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "test_util/sync_point.h" #include "util/random.h" #include "util/rate_limiter.h" -#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 317c1d6c78c..5ec332fc7a1 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -15,8 +15,8 @@ #include "rocksdb/env.h" #include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" -#include "util/aligned_buffer.h" #include "test_util/sync_point.h" +#include "util/aligned_buffer.h" namespace rocksdb { diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 18bb65a72bb..a4a9458d642 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -6,9 +6,9 @@ #include "util/file_reader_writer.h" #include #include -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" namespace rocksdb { diff --git a/util/filelock_test.cc b/util/filelock_test.cc index bd0fc7c4221..3244563d7c3 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -6,10 +6,10 @@ #include "rocksdb/status.h" #include "rocksdb/env.h" -#include #include -#include "util/coding.h" +#include #include "test_util/testharness.h" +#include "util/coding.h" namespace rocksdb { diff --git a/util/hash_test.cc b/util/hash_test.cc index 6618c5a4bc1..8973f926bc3 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -9,8 +9,8 @@ #include -#include "util/hash.h" #include "test_util/testharness.h" +#include "util/hash.h" // The hash algorithm is part of the file format, for example for the Bloom // filters. 
Test that the hash values are stable for a set of random strings of diff --git a/util/log_buffer.h b/util/log_buffer.h index e356b93a746..16fb243117d 100644 --- a/util/log_buffer.h +++ b/util/log_buffer.h @@ -5,11 +5,11 @@ #pragma once +#include +#include "memory/arena.h" +#include "port/sys_time.h" #include "rocksdb/env.h" -#include "util/arena.h" #include "util/autovector.h" -#include "port/sys_time.h" -#include namespace rocksdb { diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index dd5322151e3..9efa43f8a3c 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -13,10 +13,10 @@ int main() { #include "monitoring/histogram.h" #include "rocksdb/env.h" -#include "util/file_reader_writer.h" -#include "util/gflags_compat.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/file_reader_writer.h" +#include "util/gflags_compat.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 93665837fc4..0ee06a121ba 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -11,8 +11,8 @@ #include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/env.h" -#include "util/aligned_buffer.h" #include "test_util/sync_point.h" +#include "util/aligned_buffer.h" namespace rocksdb { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 3316a75b571..d9f17cc3ac6 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -19,9 +19,9 @@ #include "db/db_test_util.h" #include "rocksdb/env.h" -#include "util/random.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/random.h" namespace rocksdb { diff --git a/util/repeatable_thread_test.cc b/util/repeatable_thread_test.cc index 29af340d7cb..8a993e3699e 100644 --- a/util/repeatable_thread_test.cc +++ b/util/repeatable_thread_test.cc @@ -7,9 +7,9 @@ #include #include "db/db_test_util.h" -#include "util/repeatable_thread.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/repeatable_thread.h" class RepeatableThreadTest : public testing::Test { public: diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 787638138c0..9926c391745 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -7,12 +7,12 @@ #include #include -#include "rocksdb/env.h" #include "port/port.h" -#include "util/autovector.h" +#include "rocksdb/env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/autovector.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 816c9718b2d..dcd88ffdb8c 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -14,13 +14,13 @@ #include "port/port.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" +#include "test_util/sync_point.h" #include "util/channel.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "utilities/checkpoint/checkpoint_impl.h" #ifndef __STDC_FORMAT_MACROS diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index c7377064f82..05006d6a3eb 100644 --- a/utilities/backupable/backupable_db_test.cc +++ 
b/utilities/backupable/backupable_db_test.cc @@ -22,14 +22,14 @@ #include "rocksdb/types.h" #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/options_util.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/file_reader_writer.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "test_util/sync_point.h" -#include "test_util/testharness.h" -#include "test_util/testutil.h" namespace rocksdb { diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index a93169c30cd..04b7eb73e2b 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -27,6 +27,7 @@ #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_builder.h" #include "table/meta_blocks.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" @@ -34,7 +35,6 @@ #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" #include "util/timer_queue.h" #include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_db_iterator.h" diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 19dce3f87d7..19b8b0c727a 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -18,12 +18,12 @@ #include "file/sst_file_manager_impl.h" #include "port/port.h" #include "rocksdb/utilities/debug.h" -#include "util/cast_util.h" #include "test_util/fault_injection_test_env.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/cast_util.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/blob_db/blob_db.h" #include "utilities/blob_db/blob_db_impl.h" #include "utilities/blob_db/blob_index.h" diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index 347846d075c..431ef697929 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -4,16 +4,16 @@ // (found in the LICENSE.Apache file in the root directory). 
#include -#include "rocksdb/db.h" #include "db/db_impl.h" +#include "rocksdb/db.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" #include "util/random.h" -#include "utilities/merge_operators.h" #include "utilities/cassandra/cassandra_compaction_filter.h" #include "utilities/cassandra/merge_operator.h" #include "utilities/cassandra/test_utils.h" +#include "utilities/merge_operators.h" using namespace rocksdb; diff --git a/utilities/cassandra/format.h b/utilities/cassandra/format.h index 562c1aff3ff..b7f6e32f6ba 100644 --- a/utilities/cassandra/format.h +++ b/utilities/cassandra/format.h @@ -56,8 +56,8 @@ #pragma once #include -#include #include +#include #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "test_util/testharness.h" diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 224f7886bf1..29903d460f2 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -11,9 +11,9 @@ #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/stackable_db.h" #include "table/block_based/block_based_table_factory.h" -#include "util/string_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/string_util.h" namespace rocksdb { diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 160bd347bd2..f0b83f621eb 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -12,11 +12,11 @@ #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" +#include "test_util/testharness.h" +#include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/merge_operators/string_append/stringappend2.h" -#include "test_util/testharness.h" -#include "util/random.h" using namespace rocksdb; diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 342db490280..5b8015152ff 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/db.h" #include "rocksdb/table.h" #include "rocksdb/utilities/options_util.h" -#include "util/random.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #ifndef GFLAGS bool FLAGS_enable_print = false; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 775ef29cf8d..5baf64772cc 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -11,9 +11,9 @@ #include #include "port/port.h" +#include "test_util/sync_point.h" #include "util/logging.h" #include "util/stop_watch.h" -#include "test_util/sync_point.h" #include "utilities/persistent_cache/block_cache_tier_file.h" namespace rocksdb { diff --git a/utilities/persistent_cache/block_cache_tier.h b/utilities/persistent_cache/block_cache_tier.h index 670463a87f9..00dd9a173e9 100644 --- a/utilities/persistent_cache/block_cache_tier.h +++ b/utilities/persistent_cache/block_cache_tier.h @@ -27,10 +27,10 @@ #include "utilities/persistent_cache/block_cache_tier_metadata.h" #include "utilities/persistent_cache/persistent_cache_util.h" +#include "memory/arena.h" #include "memtable/skiplist.h" #include 
"monitoring/histogram.h" #include "port/port.h" -#include "util/arena.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/mutexlock.h" diff --git a/utilities/persistent_cache/block_cache_tier_file_buffer.h b/utilities/persistent_cache/block_cache_tier_file_buffer.h index 9d9465c6ca9..e4f8f5ba4b2 100644 --- a/utilities/persistent_cache/block_cache_tier_file_buffer.h +++ b/utilities/persistent_cache/block_cache_tier_file_buffer.h @@ -9,7 +9,7 @@ #include #include "include/rocksdb/comparator.h" -#include "util/arena.h" +#include "memory/arena.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index 51ad211e929..9cc1534973e 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -9,9 +9,9 @@ #include #include "db/db_test_util.h" -#include "util/arena.h" -#include "util/random.h" +#include "memory/arena.h" #include "test_util/testharness.h" +#include "util/random.h" #include "utilities/persistent_cache/hash_table.h" #include "utilities/persistent_cache/hash_table_evictable.h" diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index 6d15d13b69b..29c334442c5 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -19,10 +19,10 @@ #include #include "db/db_test_util.h" +#include "memory/arena.h" +#include "port/port.h" #include "rocksdb/cache.h" #include "table/block_based/block_builder.h" -#include "port/port.h" -#include "util/arena.h" #include "test_util/testharness.h" #include "utilities/persistent_cache/volatile_tier_impl.h" diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index e3105a2139c..4f075d0d9fc 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -9,15 +9,15 @@ #include #include +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" +#include "test_util/testharness.h" +#include "test_util/transaction_test_util.h" #include "util/crc32c.h" #include "util/logging.h" #include "util/random.h" -#include "test_util/testharness.h" -#include "test_util/transaction_test_util.h" -#include "port/port.h" using std::string; diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index fd9da17aac4..93d75a8357f 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -19,9 +19,9 @@ #include "rocksdb/snapshot.h" #include "rocksdb/status.h" #include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction_db.h" #include "utilities/transactions/transaction_util.h" diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 95c88594ca9..8920f85fb76 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -20,9 +20,9 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include 
"test_util/sync_point.h" #include "util/cast_util.h" #include "util/mutexlock.h" -#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" #include "utilities/transactions/write_prepared_txn_db.h" diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 173e012d88a..757b77fde4e 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -23,9 +23,9 @@ #include "monitoring/perf_context_imp.h" #include "rocksdb/slice.h" #include "rocksdb/utilities/transaction_db_mutex.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/hash.h" -#include "test_util/sync_point.h" #include "util/thread_local.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index d183401f42f..0750b249bbb 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -24,12 +24,12 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "test_util/fault_injection_test_env.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 8dfa6b053c5..22dc208f523 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -22,12 +22,12 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "test_util/fault_injection_test_env.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 5287cca2038..c0a7e278054 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -28,13 +28,13 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "test_util/fault_injection_test_env.h" -#include "util/mutexlock.h" -#include "util/random.h" -#include "util/string_util.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/string_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc 
index 0508a596e43..b4a71f5ea6c 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -21,10 +21,10 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "test_util/sync_point.h" #include "utilities/transactions/pessimistic_transaction.h" #include "utilities/transactions/transaction_db_mutex_impl.h" diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index c7d8f52aa52..38c6affab8f 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -9,8 +9,8 @@ #include #include "rocksdb/compaction_filter.h" #include "rocksdb/utilities/db_ttl.h" -#include "util/string_util.h" #include "test_util/testharness.h" +#include "util/string_util.h" #ifndef OS_WIN #include #endif diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index adec3475cdd..0f8f6c1d622 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -13,11 +13,11 @@ #include "db/db_impl.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "memory/arena.h" #include "memtable/skiplist.h" #include "options/db_options.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" -#include "util/arena.h" #include "util/cast_util.h" #include "util/string_util.h" #include "utilities/write_batch_with_index/write_batch_with_index_internal.h" diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index f8875d9ac1f..3e0a33c3525 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -9,14 +9,14 @@ #ifndef ROCKSDB_LITE -#include +#include "rocksdb/utilities/write_batch_with_index.h" #include +#include #include "db/column_family.h" #include "port/stack_trace.h" -#include "rocksdb/utilities/write_batch_with_index.h" +#include "test_util/testharness.h" #include "util/random.h" #include "util/string_util.h" -#include "test_util/testharness.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h"

From b9f590065872db9b818874ba4bf4402ddd476cc3 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Thu, 30 May 2019 19:29:34 -0700
Subject: [PATCH 089/572] Fix WAL replay by skipping old write batches (#5170)

Summary:
1. Fix a bug in WAL replay in which write batches with old sequence numbers are mistakenly inserted into memtables.
2. Add support for benchmarking a secondary instance in db_bench_tool. With the changes made in this PR, we can start benchmarking a secondary instance using two processes. It is also possible to vary the frequency at which the secondary instance tries to catch up with the primary. The info log of the secondary can be found in a directory whose path can be specified with '-secondary_path'.
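For readers unfamiliar with the secondary-instance feature, a minimal sketch of the two-process setup follows. It uses the public DB::OpenAsSecondary() and DB::TryCatchUpWithPrimary() APIs; the database paths are placeholders (not values used by this patch), and setting max_open_files = -1 mirrors the requirement exercised by db_secondary_test.cc below.

// Minimal sketch of opening a read-only secondary instance against a
// primary's DB directory; paths here are hypothetical.
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.max_open_files = -1;  // secondary instances require this
  rocksdb::DB* db = nullptr;
  // "name" points at the primary's DB directory; "secondary_path" is a
  // private directory for the secondary's info log, analogous to
  // db_bench's -secondary_path flag mentioned above.
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/tmp/primary_db", "/tmp/secondary_path", &db);
  assert(s.ok());
  // Replays new MANIFEST entries and WAL records from the primary. With
  // this patch, WAL replay skips write batches whose sequence numbers are
  // already covered by SSTs installed during the MANIFEST replay.
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());
  delete db;
  return 0;
}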
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5170 Differential Revision: D15564608 Pulled By: riversand963 fbshipit-source-id: ce97688ed3d33f69d3a0b9266ebbbbf887aa0ec8 --- HISTORY.md | 2 +- db/db_impl_secondary.cc | 52 +++++++++---------- db/db_secondary_test.cc | 49 ++++++++++++++++++ tools/db_bench_tool.cc | 107 +++++++++++++++++++++++++++++++--------- 4 files changed, 159 insertions(+), 51 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 55366b006fc..f645d5cc268 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,7 +22,7 @@ * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. ### Bug Fixes - +* Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl_secondary.cc b/db/db_impl_secondary.cc index 586158ef7ce..a8ea921a260 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl_secondary.cc @@ -102,7 +102,7 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { // numbers smaller than the smallest log in log_readers_, so there is no // need to pass these logs to RecoverLogFiles uint64_t log_number_min = 0; - if (log_readers_.size() > 0) { + if (!log_readers_.empty()) { log_number_min = log_readers_.begin()->first; } for (size_t i = 0; i < filenames.size(); i++) { @@ -202,11 +202,19 @@ Status DBImplSecondary::RecoverLogFiles( record.size(), Status::Corruption("log record too small")); continue; } + SequenceNumber seq = versions_->LastSequence(); WriteBatchInternal::SetContents(&batch, record); + SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); + // If the write batch's sequence number is smaller than the last sequence + // number of the db, then we should skip this write batch because its + // data must reside in an SST that has already been added in the prior + // MANIFEST replay. + if (seq_of_batch < seq) { + continue; + } std::vector column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); if (status.ok()) { - SequenceNumber seq = versions_->LastSequence(); for (const auto id : column_family_ids) { ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(id); @@ -235,10 +243,13 @@ Status DBImplSecondary::RecoverLogFiles( cfd->SetMemtable(new_mem); } } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); } - // do not check sequence number because user may toggle disableWAL - // between writes which breaks sequence number continuity guarantee - // If column family was not found, it might mean that the WAL write // batch references to the column family that was dropped after the // insert. 
We don't want to fail the whole write batch in that case -- @@ -246,14 +257,6 @@ Status DBImplSecondary::RecoverLogFiles( // That's why we set ignore missing column families to true // passing null flush_scheduler will disable memtable flushing which is // needed for secondary instances - if (status.ok()) { - bool has_valid_writes = false; - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), - nullptr /* flush_scheduler */, true, log_number, this, - false /* concurrent_memtable_writes */, next_sequence, - &has_valid_writes, seq_per_batch_, batch_per_txn_); - } if (status.ok()) { for (const auto id : column_family_ids) { ColumnFamilyData* cfd = @@ -269,31 +272,28 @@ Status DBImplSecondary::RecoverLogFiles( iter->second = log_number; } } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } } else { // We are treating this as a failure while reading since we read valid // blocks that do not form coherent data reader->GetReporter()->Corruption(record.size(), status); - continue; } } - if (!status.ok()) { return status; } - - auto last_sequence = *next_sequence - 1; - if ((*next_sequence != kMaxSequenceNumber) && - (versions_->LastSequence() <= last_sequence)) { - versions_->SetLastAllocatedSequence(last_sequence); - versions_->SetLastPublishedSequence(last_sequence); - versions_->SetLastSequence(last_sequence); - } } // remove logreaders from map after successfully recovering the WAL if (log_readers_.size() > 1) { - auto eraseIter = log_readers_.begin(); - std::advance(eraseIter, log_readers_.size() - 1); - log_readers_.erase(log_readers_.begin(), eraseIter); + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); } return status; } diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 50a0923b4c8..23132434f1f 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -526,6 +526,55 @@ TEST_F(DBSecondaryTest, SwitchManifest) { } TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k 
!= 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { const int kNumKeysPerMemtable = 1; const std::string kCFName1 = "pikachu"; Options options; diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 30aafb66069..b98fb42c458 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -752,6 +752,19 @@ DEFINE_uint64(blob_db_bytes_per_sync, 0, "Bytes to sync blob file at."); DEFINE_uint64(blob_db_file_size, 256 * 1024 * 1024, "Target size of each blob file."); +// Secondary DB instance Options +DEFINE_bool(use_secondary_db, false, + "Open a RocksDB secondary instance. A primary instance can be " + "running in another db_bench process."); + +DEFINE_string(secondary_path, "", + "Path to a directory used by the secondary instance to store " + "private files, e.g. info log."); + +DEFINE_int32(secondary_update_interval, 5, + "Secondary instance attempts to catch up with the primary every " + "secondary_update_interval seconds."); + #endif // ROCKSDB_LITE DEFINE_bool(report_bg_io_stats, false, @@ -2571,36 +2584,38 @@ class Benchmark { return base_name + ToString(id); } -void VerifyDBFromDB(std::string& truth_db_name) { - DBWithColumnFamilies truth_db; - auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } - ReadOptions ro; - ro.total_order_seek = true; - std::unique_ptr truth_iter(truth_db.db->NewIterator(ro)); - std::unique_ptr db_iter(db_.db->NewIterator(ro)); - // Verify that all the key/values in truth_db are retrivable in db with ::Get - fprintf(stderr, "Verifying db >= truth_db with ::Get...\n"); - for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) { + void VerifyDBFromDB(std::string& truth_db_name) { + DBWithColumnFamilies truth_db; + auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db); + if (!s.ok()) { + fprintf(stderr, "open error: %s\n", s.ToString().c_str()); + exit(1); + } + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr truth_iter(truth_db.db->NewIterator(ro)); + std::unique_ptr db_iter(db_.db->NewIterator(ro)); + // Verify that all the key/values in truth_db are retrivable in db with + // ::Get + fprintf(stderr, "Verifying db >= truth_db with ::Get...\n"); + for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) { std::string value; s = db_.db->Get(ro, truth_iter->key(), &value); assert(s.ok()); // TODO(myabandeh): provide debugging hints assert(Slice(value) == truth_iter->value()); + } + // Verify that the db iterator does not give any extra key/value + fprintf(stderr, "Verifying db == truth_db...\n"); + for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid(); + db_iter->Next(), truth_iter->Next()) { + assert(truth_iter->Valid()); + assert(truth_iter->value() == db_iter->value()); + } + // No more key should be left unchecked in truth_db + assert(!truth_iter->Valid()); + fprintf(stderr, "...Verified\n"); } - // Verify that the db iterator does not give any extra key/value - fprintf(stderr, "Verifying db == truth_db...\n"); - for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next(), truth_iter->Next()) { - assert(truth_iter->Valid()); - assert(truth_iter->value() == db_iter->value()); - } - // No more key should be left unchecked in 
truth_db - assert(!truth_iter->Valid()); - fprintf(stderr, "...Verified\n"); -} void Run() { if (!SanityCheck()) {
@@ -2934,6 +2949,12 @@ void VerifyDBFromDB(std::string& truth_db_name) { } } + if (secondary_update_thread_) { + secondary_update_stopped_.store(1, std::memory_order_relaxed); + secondary_update_thread_->join(); + secondary_update_thread_.reset(); + } + #ifndef ROCKSDB_LITE if (name != "replay" && FLAGS_trace_file != "") { Status s = db_.db->EndTrace();
@@ -2953,10 +2974,17 @@ void VerifyDBFromDB(std::string& truth_db_name) { ->ToString() .c_str()); } + if (FLAGS_use_secondary_db) { + fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", + secondary_db_updates_); + } } private: std::shared_ptr timestamp_emulator_; + std::unique_ptr secondary_update_thread_; + std::atomic secondary_update_stopped_{0}; + uint64_t secondary_db_updates_ = 0; struct ThreadArg { Benchmark* bm;
@@ -3618,6 +3646,11 @@ void VerifyDBFromDB(std::string& truth_db_name) { fprintf(stderr, "Cannot use readonly flag with transaction_db\n"); exit(1); } + if (FLAGS_use_secondary_db && + (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) { + fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n"); + exit(1); + } #endif // ROCKSDB_LITE }
@@ -3845,6 +3878,32 @@ void VerifyDBFromDB(std::string& truth_db_name) { if (s.ok()) { db->db = ptr; } + } else if (FLAGS_use_secondary_db) { + if (FLAGS_secondary_path.empty()) { + std::string default_secondary_path; + FLAGS_env->GetTestDirectory(&default_secondary_path); + default_secondary_path += "/dbbench_secondary"; + FLAGS_secondary_path = default_secondary_path; + } + s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db); + if (s.ok() && FLAGS_secondary_update_interval > 0) { + secondary_update_thread_.reset(new port::Thread( + [this](int interval, DBWithColumnFamilies* _db) { + while (0 == secondary_update_stopped_.load( + std::memory_order_relaxed)) { + Status secondary_update_status = + _db->db->TryCatchUpWithPrimary(); + if (!secondary_update_status.ok()) { + fprintf(stderr, "Failed to catch up with primary: %s\n", + secondary_update_status.ToString().c_str()); + break; + } + ++secondary_db_updates_; + FLAGS_env->SleepForMicroseconds(interval * 1000000); + } + }, + FLAGS_secondary_update_interval, db)); + } #endif // ROCKSDB_LITE } else { s = DB::Open(options, db_name, &db->db);
From ff9d286877dd3ec74fc829cf57935bfb479a2182 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 30 May 2019 21:29:44 -0700 Subject: [PATCH 090/572] Reorder DBImpl's private section (#5385)
Summary: The methods and fields in the private section of DBImpl were all intermingled, making it hard to figure out where the fields/methods start and where they end. I cleaned up the code a little so that all the type declarations are at the beginning, followed by methods, and all the data fields are at the end.
This follows Pull Request resolved: https://github.com/facebook/rocksdb/pull/5385 Differential Revision: D15566978 Pulled By: sagar0 fbshipit-source-id: 4618a7d819ad4e2d7cc9ae1af2c59f400140bb1b --- db/db_impl.h | 376 ++++++++++++++++++++++++++------------------------- 1 file changed, 189 insertions(+), 187 deletions(-) diff --git a/db/db_impl.h b/db/db_impl.h index f2544e85941..4c418d6f38f 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -339,7 +339,7 @@ class DBImpl : public DB { TablePropertiesCollection* props) override; #endif // ROCKSDB_LITE - + // ---- End of implementations of the DB interface ---- // Function that Get and KeyMayExist call with no_io true or false @@ -372,14 +372,13 @@ class DBImpl : public DB { // depends also on data written to the WAL but not to the memtable. SequenceNumber TEST_GetLastVisibleSequence() const; -#ifndef ROCKSDB_LITE +#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback); - // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into the current // memtables. It can then be assumed that any write with a larger(or equal) @@ -811,7 +810,7 @@ class DBImpl : public DB { size_t TEST_EstiamteStatsHistorySize() const; #endif // NDEBUG - + protected: Env* const env_; const std::string dbname_; @@ -1007,7 +1006,10 @@ class DBImpl : public DB { friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif + struct CompactionState; + struct PrepickedCompaction; + struct PurgeFileInfo; struct WriteContext { SuperVersionContext superversion_context; @@ -1024,8 +1026,138 @@ class DBImpl : public DB { } }; - struct PrepickedCompaction; - struct PurgeFileInfo; + // Class to maintain directories for all database paths other than main one. + class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); + + Directory* GetDataDir(size_t path_id) const; + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; + }; + + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size = 0; + bool getting_flushed = false; + }; + + struct LogWriterNumber { + // pass ownership of _writer + LogWriterNumber(uint64_t _number, log::Writer* _writer) + : number(_number), writer(_writer) {} + + log::Writer* ReleaseWriter() { + auto* w = writer; + writer = nullptr; + return w; + } + Status ClearWriter() { + Status s = writer->WriteBuffer(); + delete writer; + writer = nullptr; + return s; + } + + uint64_t number; + // Visual Studio doesn't support deque's member to be noncopyable because + // of a std::unique_ptr as a member. 
+ log::Writer* writer; // own + // true for some prefix of logs_ + bool getting_synced = false; + }; + + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_queue_ + struct PurgeFileInfo { + std::string fname; + std::string dir_to_sync; + FileType type; + uint64_t number; + int job_id; + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, + int jid) + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} + }; + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Information for a manual compaction + struct ManualCompactionState { + ColumnFamilyData* cfd; + int input_level; + int output_level; + uint32_t output_path_id; + Status status; + bool done; + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + }; + struct PrepickedCompaction { + // background compaction takes ownership of `compaction`. + Compaction* compaction; + // caller retains ownership of `manual_compaction_state` as it is reused + // across background compactions. + ManualCompactionState* manual_compaction_state; // nullptr if non-manual + // task limiter token is requested during compaction picking. + std::unique_ptr task_token; + }; + + struct CompactionArg { + // caller retains ownership of `db`. + DBImpl* db; + // background compaction takes ownership of `prepicked_compaction`. + PrepickedCompaction* prepicked_compaction; + }; Status ResumeImpl(); @@ -1079,34 +1211,6 @@ class DBImpl : public DB { SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, Env::Priority thread_pri); - // Argument required by background flush thread. - struct BGFlushArg { - BGFlushArg() - : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} - BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, - SuperVersionContext* superversion_context) - : cfd_(cfd), - max_memtable_id_(max_memtable_id), - superversion_context_(superversion_context) {} - - // Column family to flush. - ColumnFamilyData* cfd_; - // Maximum ID of memtable to flush. 
In this column family, memtables with - // IDs smaller than this value must be flushed before this flush completes. - uint64_t max_memtable_id_; - // Pointer to a SuperVersionContext object. After flush completes, RocksDB - // installs a new superversion for the column family. This operation - // requires a SuperVersionContext object (currently embedded in JobContext). - SuperVersionContext* superversion_context_; - }; - - // Argument passed to flush thread. - struct FlushThreadArg { - DBImpl* db_; - - Env::Priority thread_pri_; - }; - // Flush the memtables of (multiple) column families to multiple files on // persistent storage. Status FlushMemTablesToOutputFiles( @@ -1345,6 +1449,57 @@ class DBImpl : public DB { void WaitForBackgroundWork(); + // No copying allowed + DBImpl(const DBImpl&); + void operator=(const DBImpl&); + + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function. Background threads carry + // sv_context which can have new_superversion already + // allocated. + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options); + + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); +#ifndef ROCKSDB_LITE + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'. + // Write a version edit to the MANIFEST. + Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::list::iterator* pending_output_elem, + uint64_t* next_file_number); +#endif //! ROCKSDB_LITE + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); + + size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } + + Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1390,37 +1545,7 @@ class DBImpl : public DB { // expesnive mutex_ lock during WAL write, which update log_empty_. 
bool log_empty_; - struct LogFileNumberSize { - explicit LogFileNumberSize(uint64_t _number) : number(_number) {} - void AddSize(uint64_t new_size) { size += new_size; } - uint64_t number; - uint64_t size = 0; - bool getting_flushed = false; - }; - struct LogWriterNumber { - // pass ownership of _writer - LogWriterNumber(uint64_t _number, log::Writer* _writer) - : number(_number), writer(_writer) {} - log::Writer* ReleaseWriter() { - auto* w = writer; - writer = nullptr; - return w; - } - Status ClearWriter() { - Status s = writer->WriteBuffer(); - delete writer; - writer = nullptr; - return s; - } - - uint64_t number; - // Visual Studio doesn't support deque's member to be noncopyable because - // of a std::unique_ptr as a member. - log::Writer* writer; // own - // true for some prefix of logs_ - bool getting_synced = false; - }; // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_. However since back() is never popped, and push_back() // is done only from write_thread_, the same thread can access the item @@ -1467,30 +1592,6 @@ class DBImpl : public DB { bool stats_slice_initialized_ = false; - // Class to maintain directories for all database paths other than main one. - class Directories { - public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths); - - Directory* GetDataDir(size_t path_id) const; - - Directory* GetWalDir() { - if (wal_dir_) { - return wal_dir_.get(); - } - return db_dir_.get(); - } - - Directory* GetDbDir() { return db_dir_.get(); } - - private: - std::unique_ptr db_dir_; - std::vector> data_dirs_; - std::unique_ptr wal_dir_; - }; - Directories directories_; WriteBufferManager* write_buffer_manager_; @@ -1526,19 +1627,6 @@ class DBImpl : public DB { // State is protected with db mutex. std::list pending_outputs_; - // PurgeFileInfo is a structure to hold information of files to be deleted in - // purge_queue_ - struct PurgeFileInfo { - std::string fname; - std::string dir_to_sync; - FileType type; - uint64_t number; - int job_id; - PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, - int jid) - : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} - }; - // flush_queue_ and compaction_queue_ hold column families that we need to // flush and compact, respectively. // A column family is inserted into flush_queue_ when it satisfies condition @@ -1595,42 +1683,8 @@ class DBImpl : public DB { // number of background obsolete file purge jobs, submitted to the HIGH pool int bg_purge_scheduled_; - // Information for a manual compaction - struct ManualCompactionState { - ColumnFamilyData* cfd; - int input_level; - int output_level; - uint32_t output_path_id; - Status status; - bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress - }; - struct PrepickedCompaction { - // background compaction takes ownership of `compaction`. 
- Compaction* compaction; - // caller retains ownership of `manual_compaction_state` as it is reused - // across background compactions. - ManualCompactionState* manual_compaction_state; // nullptr if non-manual - // task limiter token is requested during compaction picking. - std::unique_ptr task_token; - }; std::deque manual_compaction_dequeue_; - struct CompactionArg { - // caller retains ownership of `db`. - DBImpl* db; - // background compaction takes ownership of `prepicked_compaction`. - PrepickedCompaction* prepicked_compaction; - }; - // shall we disable deletion of obsolete files // if 0 the deletion is enabled. // if non-zero, files will not be getting deleted @@ -1726,58 +1780,6 @@ class DBImpl : public DB { // REQUIRES: mutex locked std::unique_ptr thread_persist_stats_; - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - // Background threads call this function, which is just a wrapper around - // the InstallSuperVersion() function. Background threads carry - // sv_context which can have new_superversion already - // allocated. - // All ColumnFamily state changes go through this function. Here we analyze - // the new state and we schedule background work if we detect that the new - // state needs flush or compaction. - void InstallSuperVersionAndScheduleWork( - ColumnFamilyData* cfd, SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options); - - - bool GetIntPropertyInternal(ColumnFamilyData* cfd, - const DBPropertyInfo& property_info, - bool is_locked, uint64_t* value); - bool GetPropertyHandleOptionsStatistics(std::string* value); - - bool HasPendingManualCompaction(); - bool HasExclusiveManualCompaction(); - void AddManualCompaction(ManualCompactionState* m); - void RemoveManualCompaction(ManualCompactionState* m); - bool ShouldntRunManualCompaction(ManualCompactionState* m); - bool HaveManualCompaction(ColumnFamilyData* cfd); - bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); -#ifndef ROCKSDB_LITE - void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, - const Status& st, - const CompactionJobStats& compaction_job_stats, - const int job_id, const Version* current, - CompactionJobInfo* compaction_job_info) const; - // Reserve the next 'num' file numbers for to-be-ingested external SST files, - // and return the current file_number in 'next_file_number'. - // Write a version edit to the MANIFEST. - Status ReserveFileNumbersBeforeIngestion( - ColumnFamilyData* cfd, uint64_t num, - std::list::iterator* pending_output_elem, - uint64_t* next_file_number); -#endif //! ROCKSDB_LITE - - bool ShouldPurge(uint64_t file_number) const; - void MarkAsGrabbedForPurge(uint64_t file_number); - - size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - - Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); - // When set, we use a separate queue for writes that dont write to memtable. // In 2PC these are the writes at Prepare phase. 
const bool two_write_queues_; From ab8f6c01a6c48fd7b8c752a3ef0ef8640065dd48 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 30 May 2019 21:30:41 -0700 Subject: [PATCH 091/572] move LevelCompactionPicker to a separate file (#5369) Summary: In order to improve code readability, this PR moves LevelCompactionBuilder and LevelCompactionPicker to compaction_picker_level.h and .cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/5369 Differential Revision: D15540172 Pulled By: miasantreble fbshipit-source-id: c1a578b93f127cd63661b53f32b356e6edd349af --- CMakeLists.txt | 3 +- TARGETS | 1 + db/column_family.cc | 1 + db/compaction_picker.cc | 534 -------------------------------- db/compaction_picker.h | 17 -- db/compaction_picker_level.cc | 558 ++++++++++++++++++++++++++++++++++ db/compaction_picker_level.h | 32 ++ db/compaction_picker_test.cc | 1 + src.mk | 1 + 9 files changed, 596 insertions(+), 552 deletions(-) create mode 100644 db/compaction_picker_level.cc create mode 100644 db/compaction_picker_level.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c4dc2500fb5..3ddea95deaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -485,6 +485,7 @@ set(SOURCES db/compaction_job.cc db/compaction_picker.cc db/compaction_picker_fifo.cc + db/compaction_picker_level.cc db/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc @@ -945,7 +946,7 @@ if(WITH_TESTS) table/block_based/block_test.cc table/block_based/data_block_hash_index_test.cc table/block_based/full_filter_block_test.cc - table/block_based/partitioned_filter_block_test.cc + table/block_based/partitioned_filter_block_test.cc table/cleanable_test.cc table/cuckoo/cuckoo_table_builder_test.cc table/cuckoo/cuckoo_table_reader_test.cc diff --git a/TARGETS b/TARGETS index a59af2fa697..dc39f87bcef 100644 --- a/TARGETS +++ b/TARGETS @@ -88,6 +88,7 @@ cpp_library( "db/compaction_job.cc", "db/compaction_picker.cc", "db/compaction_picker_fifo.cc", + "db/compaction_picker_level.cc", "db/compaction_picker_universal.cc", "db/convenience.cc", "db/db_filesnapshot.cc", diff --git a/db/column_family.cc b/db/column_family.cc index 84f521cd7b8..fde1996aeaf 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -21,6 +21,7 @@ #include "db/compaction_picker.h" #include "db/compaction_picker_fifo.h" +#include "db/compaction_picker_level.h" #include "db/compaction_picker_universal.h" #include "db/db_impl.h" #include "db/internal_stats.h" diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index b25f6cb0890..bfe13828b18 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -1111,538 +1111,4 @@ bool CompactionPicker::GetOverlappingL0Files( return true; } -bool LevelCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage) const { - if (!vstorage->ExpiredTtlFiles().empty()) { - return true; - } - if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { - return true; - } - if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { - return true; - } - if (!vstorage->FilesMarkedForCompaction().empty()) { - return true; - } - for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { - if (vstorage->CompactionScore(i) >= 1) { - return true; - } - } - return false; -} - -namespace { -// A class to build a leveled compaction step-by-step. 
-class LevelCompactionBuilder { - public: - LevelCompactionBuilder(const std::string& cf_name, - VersionStorageInfo* vstorage, - CompactionPicker* compaction_picker, - LogBuffer* log_buffer, - const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions) - : cf_name_(cf_name), - vstorage_(vstorage), - compaction_picker_(compaction_picker), - log_buffer_(log_buffer), - mutable_cf_options_(mutable_cf_options), - ioptions_(ioptions) {} - - // Pick and return a compaction. - Compaction* PickCompaction(); - - // Pick the initial files to compact to the next level. (or together - // in Intra-L0 compactions) - void SetupInitialFiles(); - - // If the initial files are from L0 level, pick other L0 - // files if needed. - bool SetupOtherL0FilesIfNeeded(); - - // Based on initial files, setup other files need to be compacted - // in this compaction, accordingly. - bool SetupOtherInputsIfNeeded(); - - Compaction* GetCompaction(); - - // For the specfied level, pick a file that we want to compact. - // Returns false if there is no file to compact. - // If it returns true, inputs->files.size() will be exactly one. - // If level is 0 and there is already a compaction on that level, this - // function will return false. - bool PickFileToCompact(); - - // For L0->L0, picks the longest span of files that aren't currently - // undergoing compaction for which work-per-deleted-file decreases. The span - // always starts from the newest L0 file. - // - // Intra-L0 compaction is independent of all other files, so it can be - // performed even when L0->base_level compactions are blocked. - // - // Returns true if `inputs` is populated with a span of files to be compacted; - // otherwise, returns false. - bool PickIntraL0Compaction(); - - void PickExpiredTtlFiles(); - - void PickFilesMarkedForPeriodicCompaction(); - - const std::string& cf_name_; - VersionStorageInfo* vstorage_; - CompactionPicker* compaction_picker_; - LogBuffer* log_buffer_; - int start_level_ = -1; - int output_level_ = -1; - int parent_index_ = -1; - int base_index_ = -1; - double start_level_score_ = 0; - bool is_manual_ = false; - CompactionInputFiles start_level_inputs_; - std::vector compaction_inputs_; - CompactionInputFiles output_level_inputs_; - std::vector grandparents_; - CompactionReason compaction_reason_ = CompactionReason::kUnknown; - - const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; - // Pick a path ID to place a newly generated file, with its level - static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - int level); - - static const int kMinFilesForIntraL0Compaction = 4; -}; - -void LevelCompactionBuilder::PickExpiredTtlFiles() { - if (vstorage_->ExpiredTtlFiles().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - start_level_ = level_file.first; - output_level_ = - (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; - - if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || - (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty())) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->ExpiredTtlFiles()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { - if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - output_level_ = start_level_ = level_file.first; - - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::SetupInitialFiles() { - // Find the compactions by size on all levels. - bool skipped_l0_to_base = false; - for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { - start_level_score_ = vstorage_->CompactionScore(i); - start_level_ = vstorage_->CompactionScoreLevel(i); - assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); - if (start_level_score_ >= 1) { - if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { - // If L0->base_level compaction is pending, don't schedule further - // compaction from base level. Otherwise L0->base_level compaction - // may starve. - continue; - } - output_level_ = - (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact()) { - // found the compaction! - if (start_level_ == 0) { - // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - } else { - // L1+ score = `Level files size` / `MaxBytesForLevel` - compaction_reason_ = CompactionReason::kLevelMaxLevelSize; - } - break; - } else { - // didn't find the compaction, clear the inputs - start_level_inputs_.clear(); - if (start_level_ == 0) { - skipped_l0_to_base = true; - // L0->base_level may be blocked due to ongoing L0->base_level - // compactions. It may also be blocked by an ongoing compaction from - // base_level downwards. - // - // In these cases, to reduce L0 file count and thus reduce likelihood - // of write stalls, we can attempt compacting a span of files within - // L0. 
- if (PickIntraL0Compaction()) { - output_level_ = 0; - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - break; - } - } - } - } - } - - // if we didn't find a compaction, check if there are any files marked for - // compaction - if (start_level_inputs_.empty()) { - parent_index_ = base_index_ = -1; - - compaction_picker_->PickFilesMarkedForCompaction( - cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); - if (!start_level_inputs_.empty()) { - is_manual_ = true; - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; - return; - } - } - - // Bottommost Files Compaction on deleting tombstones - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; - return; - } - } - - // TTL Compaction - if (start_level_inputs_.empty()) { - PickExpiredTtlFiles(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kTtl; - return; - } - } - - // Periodic Compaction - if (start_level_inputs_.empty()) { - PickFilesMarkedForPeriodicCompaction(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kPeriodicCompaction; - return; - } - } -} - -bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { - if (start_level_ == 0 && output_level_ != 0) { - return compaction_picker_->GetOverlappingL0Files( - vstorage_, &start_level_inputs_, output_level_, &parent_index_); - } - return true; -} - -bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { - // Setup input files from output level. For output to L0, we only compact - // spans of files that do not interact with any pending compactions, so don't - // need to consider other levels. - if (output_level_ != 0) { - output_level_inputs_.level = output_level_; - if (!compaction_picker_->SetupOtherInputs( - cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, - &output_level_inputs_, &parent_index_, base_index_)) { - return false; - } - - compaction_inputs_.push_back(start_level_inputs_); - if (!output_level_inputs_.empty()) { - compaction_inputs_.push_back(output_level_inputs_); - } - - // In some edge cases we could pick a compaction that will be compacting - // a key range that overlap with another running compaction, and both - // of them have the same output level. This could happen if - // (1) we are running a non-exclusive manual compaction - // (2) AddFile ingest a new file into the LSM tree - // We need to disallow this from happening. - if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, - output_level_)) { - // This compaction output could potentially conflict with the output - // of a currently running compaction, we cannot run it. 
- return false; - } - compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, - output_level_inputs_, &grandparents_); - } else { - compaction_inputs_.push_back(start_level_inputs_); - } - return true; -} - -Compaction* LevelCompactionBuilder::PickCompaction() { - // Pick up the first file to start compaction. It may have been extended - // to a clean cut. - SetupInitialFiles(); - if (start_level_inputs_.empty()) { - return nullptr; - } - assert(start_level_ >= 0 && output_level_ >= 0); - - // If it is a L0 -> base level compaction, we need to set up other L0 - // files if needed. - if (!SetupOtherL0FilesIfNeeded()) { - return nullptr; - } - - // Pick files in the output level and expand more files in the start level - // if needed. - if (!SetupOtherInputsIfNeeded()) { - return nullptr; - } - - // Form a compaction object containing the files we picked. - Compaction* c = GetCompaction(); - - TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); - - return c; -} - -Compaction* LevelCompactionBuilder::GetCompaction() { - auto c = new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, - MaxFileSizeForLevel(mutable_cf_options_, output_level_, - ioptions_.compaction_style, vstorage_->base_level(), - ioptions_.level_compaction_dynamic_level_bytes), - mutable_cf_options_.max_compaction_bytes, - GetPathId(ioptions_, mutable_cf_options_, output_level_), - GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, - output_level_, vstorage_->base_level()), - GetCompressionOptions(ioptions_, vstorage_, output_level_), - /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, - start_level_score_, false /* deletion_compaction */, compaction_reason_); - - // If it's level 0 compaction, make sure we don't execute any other level 0 - // compactions in parallel - compaction_picker_->RegisterCompaction(c); - - // Creating a compaction influences the compaction score because the score - // takes running compactions into account (by skipping files that are already - // being compacted). Since we just changed compaction score, we recalculate it - // here - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); - return c; -} - -/* - * Find the optimal path to place a file - * Given a level, finds the path where levels up to it will fit in levels - * up to and including this path - */ -uint32_t LevelCompactionBuilder::GetPathId( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, int level) { - uint32_t p = 0; - assert(!ioptions.cf_paths.empty()); - - // size remaining in the most recent path - uint64_t current_path_size = ioptions.cf_paths[0].target_size; - - uint64_t level_size; - int cur_level = 0; - - // max_bytes_for_level_base denotes L1 size. - // We estimate L0 size to be the same as L1. - level_size = mutable_cf_options.max_bytes_for_level_base; - - // Last path is the fallback - while (p < ioptions.cf_paths.size() - 1) { - if (level_size <= current_path_size) { - if (cur_level == level) { - // Does desired level fit in this path? - return p; - } else { - current_path_size -= level_size; - if (cur_level > 0) { - if (ioptions.level_compaction_dynamic_level_bytes) { - // Currently, level_compaction_dynamic_level_bytes is ignored when - // multiple db paths are specified. https://github.com/facebook/ - // rocksdb/blob/master/db/column_family.cc. 
- // Still, adding this check to avoid accidentally using - // max_bytes_for_level_multiplier_additional - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier); - } else { - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier * - mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); - } - } - cur_level++; - continue; - } - } - p++; - current_path_size = ioptions.cf_paths[p].target_size; - } - return p; -} - -bool LevelCompactionBuilder::PickFileToCompact() { - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); - return false; - } - - start_level_inputs_.clear(); - - assert(start_level_ >= 0); - - // Pick the largest file in this level that is not already - // being compacted - const std::vector& file_size = - vstorage_->FilesByCompactionPri(start_level_); - const std::vector& level_files = - vstorage_->LevelFiles(start_level_); - - unsigned int cmp_idx; - for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); - cmp_idx < file_size.size(); cmp_idx++) { - int index = file_size[cmp_idx]; - auto* f = level_files[index]; - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - start_level_inputs_.files.push_back(f); - start_level_inputs_.level = start_level_; - if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_) || - compaction_picker_->FilesRangeOverlapWithCompaction( - {start_level_inputs_}, output_level_)) { - // A locked (pending compaction) input-level file was pulled in due to - // user-key overlap. - start_level_inputs_.clear(); - continue; - } - - // Now that input level is fully expanded, we check whether any output files - // are locked due to pending compaction. - // - // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- - // level files are locked, not just the extra ones pulled in for user-key - // overlap. - InternalKey smallest, largest; - compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); - CompactionInputFiles output_level_inputs; - output_level_inputs.level = output_level_; - vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, - &output_level_inputs.files); - if (!output_level_inputs.empty() && - !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &output_level_inputs)) { - start_level_inputs_.clear(); - continue; - } - base_index_ = index; - break; - } - - // store where to start the iteration in the next call to PickCompaction - vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); - - return start_level_inputs_.size() > 0; -} - -bool LevelCompactionBuilder::PickIntraL0Compaction() { - start_level_inputs_.clear(); - const std::vector& level_files = - vstorage_->LevelFiles(0 /* level */); - if (level_files.size() < - static_cast( - mutable_cf_options_.level0_file_num_compaction_trigger + 2) || - level_files[0]->being_compacted) { - // If L0 isn't accumulating much files beyond the regular trigger, don't - // resort to L0->L0 compaction yet. 
- return false; - } - return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - port::kMaxUint64, - mutable_cf_options_.max_compaction_bytes, - &start_level_inputs_); -} -} // namespace - -Compaction* LevelCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, - mutable_cf_options, ioptions_); - return builder.PickCompaction(); -} - } // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 05895a26753..437c8d30473 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -236,23 +236,6 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; -// Picking compactions for leveled compaction. See wiki page -// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction -// for description of Leveled compaction. -class LevelCompactionPicker : public CompactionPicker { - public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; -}; - #ifndef ROCKSDB_LITE // A dummy compaction that never triggers any automatic // compaction. diff --git a/db/compaction_picker_level.cc b/db/compaction_picker_level.cc new file mode 100644 index 00000000000..70fe46c5b81 --- /dev/null +++ b/db/compaction_picker_level.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction_picker_level.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "util/log_buffer.h" + +namespace rocksdb { + +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +namespace { +// A class to build a leveled compaction step-by-step. 
+class LevelCompactionBuilder { + public: + LevelCompactionBuilder(const std::string& cf_name, + VersionStorageInfo* vstorage, + CompactionPicker* compaction_picker, + LogBuffer* log_buffer, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& ioptions) + : cf_name_(cf_name), + vstorage_(vstorage), + compaction_picker_(compaction_picker), + log_buffer_(log_buffer), + mutable_cf_options_(mutable_cf_options), + ioptions_(ioptions) {} + + // Pick and return a compaction. + Compaction* PickCompaction(); + + // Pick the initial files to compact to the next level. (or together + // in Intra-L0 compactions) + void SetupInitialFiles(); + + // If the initial files are from L0 level, pick other L0 + // files if needed. + bool SetupOtherL0FilesIfNeeded(); + + // Based on initial files, setup other files need to be compacted + // in this compaction, accordingly. + bool SetupOtherInputsIfNeeded(); + + Compaction* GetCompaction(); + + // For the specfied level, pick a file that we want to compact. + // Returns false if there is no file to compact. + // If it returns true, inputs->files.size() will be exactly one. + // If level is 0 and there is already a compaction on that level, this + // function will return false. + bool PickFileToCompact(); + + // For L0->L0, picks the longest span of files that aren't currently + // undergoing compaction for which work-per-deleted-file decreases. The span + // always starts from the newest L0 file. + // + // Intra-L0 compaction is independent of all other files, so it can be + // performed even when L0->base_level compactions are blocked. + // + // Returns true if `inputs` is populated with a span of files to be compacted; + // otherwise, returns false. + bool PickIntraL0Compaction(); + + void PickExpiredTtlFiles(); + + void PickFilesMarkedForPeriodicCompaction(); + + const std::string& cf_name_; + VersionStorageInfo* vstorage_; + CompactionPicker* compaction_picker_; + LogBuffer* log_buffer_; + int start_level_ = -1; + int output_level_ = -1; + int parent_index_ = -1; + int base_index_ = -1; + double start_level_score_ = 0; + bool is_manual_ = false; + CompactionInputFiles start_level_inputs_; + std::vector compaction_inputs_; + CompactionInputFiles output_level_inputs_; + std::vector grandparents_; + CompactionReason compaction_reason_ = CompactionReason::kUnknown; + + const MutableCFOptions& mutable_cf_options_; + const ImmutableCFOptions& ioptions_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + + static const int kMinFilesForIntraL0Compaction = 4; +}; + +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + output_level_ = + (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; + + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { + if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + output_level_ = start_level_ = level_file.first; + + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::SetupInitialFiles() { + // Find the compactions by size on all levels. + bool skipped_l0_to_base = false; + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + start_level_score_ = vstorage_->CompactionScore(i); + start_level_ = vstorage_->CompactionScoreLevel(i); + assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); + if (start_level_score_ >= 1) { + if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + if (PickFileToCompact()) { + // found the compaction! + if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. 
+ if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + if (start_level_inputs_.empty()) { + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, + &start_level_inputs_); + if (!start_level_inputs_.empty()) { + is_manual_ = true; + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + } + + // Bottommost Files Compaction on deleting tombstones + if (start_level_inputs_.empty()) { + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + } + + // TTL Compaction + if (start_level_inputs_.empty()) { + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + } + + // Periodic Compaction + if (start_level_inputs_.empty()) { + PickFilesMarkedForPeriodicCompaction(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + if (!compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, + output_level_)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level_, vstorage_->base_level()), + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. 
+ // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.cf_paths[p].target_size; + } + return p; +} + +bool LevelCompactionBuilder::PickFileToCompact() { + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compaction at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); + return false; + } + + start_level_inputs_.clear(); + + assert(start_level_ >= 0); + + // Pick the largest file in this level that is not already + // being compacted + const std::vector<int>& file_size = + vstorage_->FilesByCompactionPri(start_level_); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(start_level_); + + unsigned int cmp_idx; + for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); + cmp_idx < file_size.size(); cmp_idx++) { + int index = file_size[cmp_idx]; + auto* f = level_files[index]; + + // do not pick a file to compact if it is being compacted + // from the n-1 level. + if (f->being_compacted) { + continue; + } + + start_level_inputs_.files.push_back(f); + start_level_inputs_.level = start_level_; + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {start_level_inputs_}, output_level_)) { + // A locked (pending compaction) input-level file was pulled in due to + // user-key overlap. + start_level_inputs_.clear(); + continue; + } + + // Now that the input level is fully expanded, we check whether any output + // files are locked due to pending compaction. + // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + continue; + } + base_index_ = index; + break; + } + + // store where to start the iteration in the next call to PickCompaction + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); + + return start_level_inputs_.size() > 0; +} + +bool LevelCompactionBuilder::PickIntraL0Compaction() { + start_level_inputs_.clear(); + const std::vector<FileMetaData*>& level_files = + vstorage_->LevelFiles(0 /* level */); + if (level_files.size() < + static_cast<size_t>( + mutable_cf_options_.level0_file_num_compaction_trigger + 2) || + level_files[0]->being_compacted) { + // If L0 isn't accumulating many files beyond the regular trigger, don't + // resort to L0->L0 compaction yet.
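+ // Worked example (assumed setting): with
+ // level0_file_num_compaction_trigger = 4, intra-L0 compaction is only
+ // considered once L0 holds at least 4 + 2 = 6 files and the oldest of
+ // them is not already being compacted.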
+ return false; + } + return FindIntraL0Compaction( + level_files, kMinFilesForIntraL0Compaction, port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, &start_level_inputs_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, + mutable_cf_options, ioptions_); + return builder.PickCompaction(); +} +} // namespace rocksdb diff --git a/db/compaction_picker_level.h b/db/compaction_picker_level.h new file mode 100644 index 00000000000..1d37fe50eaf --- /dev/null +++ b/db/compaction_picker_level.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction_picker.h" + +namespace rocksdb { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace rocksdb diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index dd33009eb12..c3e9e450ff0 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -10,6 +10,7 @@ #include #include "db/compaction.h" #include "db/compaction_picker_fifo.h" +#include "db/compaction_picker_level.h" #include "db/compaction_picker_universal.h" #include "test_util/testharness.h" diff --git a/src.mk b/src.mk index c1ab36b8a61..44013bc2e1d 100644 --- a/src.mk +++ b/src.mk @@ -12,6 +12,7 @@ LIB_SOURCES = \ db/compaction_job.cc \ db/compaction_picker.cc \ db/compaction_picker_fifo.cc \ + db/compaction_picker_level.cc \ db/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ From 83f7a8eed0592cfe275ca5247069adb0acdf75d3 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 31 May 2019 08:24:05 -0700 Subject: [PATCH 092/572] Fix compilation error in LITE mode (#5391) Summary: Add macro ROCKSDB_LITE to fix compilation. 
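The fix follows the usual LITE-build pattern: code that touches non-LITE features is fenced off with the ROCKSDB_LITE macro. A minimal sketch of the pattern (the function name below is hypothetical, not from this patch):

#ifndef ROCKSDB_LITE
  // Compiled only in full builds; LITE builds drop the feature entirely.
  ReportSecondaryDbStats();
#endif  // ROCKSDB_LITE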
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5391 Differential Revision: D15574522 Pulled By: riversand963 fbshipit-source-id: 95aea83c5d9b2bf98a3ba0ef9167b63c9be2988b --- tools/db_bench_tool.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b98fb42c458..d80502f16fa 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2974,10 +2974,13 @@ class Benchmark { ->ToString() .c_str()); } + +#ifndef ROCKSDB_LITE + if (FLAGS_use_secondary_db) { fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", secondary_db_updates_); } +#endif // ROCKSDB_LITE } private: From 0834bbd0b108b7ddc66f963be657d6719515a687 Mon Sep 17 00:00:00 2001 From: qinzuoyan Date: Fri, 31 May 2019 10:40:39 -0700 Subject: [PATCH 093/572] Configure ccache in CMakeLists.txt to speed up compilation Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5388 Differential Revision: D15579052 Pulled By: siying fbshipit-source-id: ee58770fe023f40b9aa189a225e4c7ef50613ea9 --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ddea95deaf..9a4d9deb1b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,12 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + option(WITH_JEMALLOC "build with JeMalloc" OFF) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) From cb094e13bbadb4031ecab95e084418da60973312 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 31 May 2019 10:45:20 -0700 Subject: [PATCH 094/572] Auto roll logger to enforce options.keep_log_file_num immediately after a new file is created (#5370) Summary: Right now, with the auto roll logger, options.keep_log_file_num enforcement is only triggered by events such as a DB reopen or a full obsolete-file scan. In the meantime, the size and number of log files can grow without limit. This change adds a stronger enforcement of the option, so that the number of log files is always kept under control. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5370 Differential Revision: D15570413 Pulled By: siying fbshipit-source-id: 0916c3c4d42ab8fdd29389ee7fd7e1557b03176e --- HISTORY.md | 1 + file/filename.cc | 32 ++++ file/filename.h | 8 + util/auto_roll_logger.cc | 103 +++++++++++- util/auto_roll_logger.h | 33 ++-- util/auto_roll_logger_test.cc | 194 ++++++++++++++++++++--- utilities/convenience/info_log_finder.cc | 29 +--- 7 files changed, 328 insertions(+), 72 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f645d5cc268..b9b6998c6f5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Partitions of partitioned indexes no longer affect the read amplification statistics. * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either options.max_log_file_size or options.log_file_time_to_roll is set.
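A minimal sketch of the options this entry refers to (values are illustrative, not recommendations):

rocksdb::Options options;
options.max_log_file_size = 1024 * 1024;  // roll the info LOG at ~1 MB
options.log_file_time_to_roll = 3600;     // or roll it hourly
options.keep_log_file_num = 5;            // with this change, also trimmed at each roll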
### New Features * Add an option `snap_refresh_nanos` (defaults to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/file/filename.cc b/file/filename.cc index a8fb780054a..6f00d15ebca 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -407,4 +407,36 @@ Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, return file->Sync(db_options->use_fsync); } +Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, std::string* parent_dir, + std::vector<std::string>* info_log_list) { + assert(parent_dir != nullptr); + assert(info_log_list != nullptr); + uint64_t number = 0; + FileType type; + + if (!db_log_dir.empty()) { + *parent_dir = db_log_dir; + } else { + *parent_dir = dbname; + } + + InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); + + std::vector<std::string> file_names; + Status s = env->GetChildren(*parent_dir, &file_names); + + if (!s.ok()) { + return s; + } + + for (auto& f : file_names) { + if (ParseFileName(f, &number, info_log_prefix.prefix, &type) && + (type == kInfoLogFile)) { + info_log_list->push_back(f); + } + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/file/filename.h b/file/filename.h index eea6b1b02fd..db06f4664e2 100644 --- a/file/filename.h +++ b/file/filename.h @@ -169,4 +169,12 @@ extern Status SetIdentityFile(Env* env, const std::string& dbname); extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, WritableFileWriter* file); +// Returns the list of info log file names in `file_names`. +// The list contains only file names; the parent directory name is stored +// in `parent_dir`. +// `db_log_dir` should be the same as options.db_log_dir +extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, + std::string* parent_dir, + std::vector<std::string>* file_names); } // namespace rocksdb diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc index ae6061aed43..9e8d6750319 100644 --- a/util/auto_roll_logger.cc +++ b/util/auto_roll_logger.cc @@ -4,12 +4,53 @@ // (found in the LICENSE.Apache file in the root directory).
// #include "util/auto_roll_logger.h" +#include <algorithm> +#include "file/filename.h" +#include "util/logging.h" #include "util/mutexlock.h" namespace rocksdb { #ifndef ROCKSDB_LITE // -- AutoRollLogger + +AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, + size_t log_max_size, + size_t log_file_time_to_roll, + size_t keep_log_file_num, + const InfoLogLevel log_level) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + kKeepLogFileNum(keep_log_file_num), + cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + if (s.IsNotSupported()) { + db_absolute_path_ = dbname; + } else { + status_ = s; + } + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + if (env_->FileExists(log_fname_).ok()) { + RollLogFile(); + } + GetExistingFiles(); + ResetLogger(); + s = TrimOldLogFiles(); + if (!status_.ok()) { + status_ = s; + } +} + Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); status_ = env_->NewLogger(log_fname_, &logger_); @@ -44,6 +85,58 @@ void AutoRollLogger::RollLogFile() { now++; } while (env_->FileExists(old_fname).ok()); env_->RenameFile(log_fname_, old_fname); + old_log_files_.push(old_fname); +} + +void AutoRollLogger::GetExistingFiles() { + { + // Empty the queue to avoid duplicated entries in the queue. + std::queue<std::string> empty; + std::swap(old_log_files_, empty); + } + + std::string parent_dir; + std::vector<std::string> info_log_files; + Status s = + GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + if (status_.ok()) { + status_ = s; + } + // We need to sort the files before enqueuing them so that when we + // delete from the front, it is the oldest file. + std::sort(info_log_files.begin(), info_log_files.end()); + + for (const std::string& f : info_log_files) { + old_log_files_.push(parent_dir + "/" + f); + } +} + +Status AutoRollLogger::TrimOldLogFiles() { + // Here we directly list info files and delete them through Env. + // The deletion isn't going through DB, so there are shortcomings: + // 1. the deletion is not rate limited by SstFileManager + // 2. there is a chance that an I/O will be issued here + // Since it's going to be complicated to pass the DB object down to + // here, we take a simple approach to keep the code easier to + // maintain. + + // old_log_files_.empty() is helpful for the corner case that + // kKeepLogFileNum == 0. We can instead check kKeepLogFileNum != 0 but + // it's essentially the same thing, and checking empty before accessing + // the queue feels safer. + while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { + Status s = env_->DeleteFile(old_log_files_.front()); + // Remove the file from the tracking anyway. It's possible that + // DB cleaned up the old log file, or people cleaned it up manually. + old_log_files_.pop(); + // To make the file really go away, we should sync the parent directory. + // Since there isn't any consistency issue involved here, skipping + // this part to avoid one I/O here.
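+ // Worked example (assumed numbers): with kKeepLogFileNum == 5 and five
+ // tracked old files right after a roll, this loop deletes one file from
+ // the front of the queue (the oldest), leaving four old files plus the
+ // live LOG file -- five files in total.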
+ if (!s.ok()) { + return s; + } + } + return Status::OK(); } std::string AutoRollLogger::ValistToString(const char* format, @@ -78,12 +171,19 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { RollLogFile(); Status s = ResetLogger(); + Status s2 = TrimOldLogFiles(); + if (!s.ok()) { // can't really log the error if creating a new LOG file failed return; } WriteHeaderInfo(); + + if (!s2.ok()) { + ROCKS_LOG_WARN(logger.get(), "Fail to trim old info log file: %s", + s2.ToString().c_str()); + } } // pin down the current logger_ instance before releasing the mutex. @@ -153,7 +253,8 @@ Status CreateLoggerFromOptions(const std::string& dbname, if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.info_log_level); + options.log_file_time_to_roll, options.keep_log_file_num, + options.info_log_level); Status s = result->GetStatus(); if (!s.ok()) { delete result; diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index a5b2139fcaf..a14fbfd5892 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -8,6 +8,7 @@ #pragma once #include <list> +#include <queue> #include <string> #include "file/filename.h" @@ -24,25 +25,8 @@ class AutoRollLogger : public Logger { public: AutoRollLogger(Env* env, const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, - const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) - : Logger(log_level), - dbname_(dbname), - db_log_dir_(db_log_dir), - env_(env), - status_(Status::OK()), - kMaxLogFileSize(log_max_size), - kLogFileTimeToRoll(log_file_time_to_roll), - cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)), - ctime_(cached_now), - cached_now_access_count(0), - call_NowMicros_every_N_records_(100), - mutex_() { - env->GetAbsolutePath(dbname, &db_absolute_path_); - log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - RollLogFile(); - ResetLogger(); - } + size_t log_file_time_to_roll, size_t keep_log_file_num, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; void Logv(const char* format, va_list ap) override; @@ -110,6 +94,11 @@ class AutoRollLogger : public Logger { bool LogExpired(); Status ResetLogger(); void RollLogFile(); + // Read all names of old log files into old_log_files_. + // If there is any error, put the error code in status_. + void GetExistingFiles(); + // Delete old log files if their number exceeds the limit. + Status TrimOldLogFiles(); // Log message to logger without rolling void LogInternal(const char* format, ...); // Serialize the va_list to a string @@ -126,8 +115,14 @@ class AutoRollLogger : public Logger { Status status_; const size_t kMaxLogFileSize; const size_t kLogFileTimeToRoll; + const size_t kKeepLogFileNum; // header information std::list<std::string> headers_; + // List of all existing info log files. Used for enforcing number of + // info log files. + // Full path is stored here. It consumes significantly more memory + // than storing only the file name. Can optimize if it causes a problem.
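+ // Rough illustration (assumed sizes): even 1,000 tracked files with
+ // ~100-byte full paths cost only ~100 KB, which is why storing full
+ // paths is acceptable until proven otherwise.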
+ std::queue<std::string> old_log_files_; + // to avoid frequent env->NowMicros() calls, we cache the current time uint64_t cached_now; uint64_t ctime_; diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 87de5ed5b9f..ff47719d490 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -41,6 +41,21 @@ class NoSleepEnv : public EnvWrapper { }; } // namespace +// In this test we only want to Log some simple log message with +// no format. LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call ROCKS_LOG_INFO(logger, log_message) directly. +namespace { +void LogMessage(Logger* logger, const char* message) { + ROCKS_LOG_INFO(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} +} // namespace + class AutoRollLoggerTest : public testing::Test { public: static void InitTestDb() { @@ -62,6 +77,41 @@ class AutoRollLoggerTest : public testing::Test { const std::string& log_message); void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, const std::string& log_message); + // return list of files under kTestDir that contain "LOG" + std::vector<std::string> GetLogFiles() { + std::vector<std::string> ret; + std::vector<std::string> files; + Status s = default_env->GetChildren(kTestDir, &files); + // Should call ASSERT_OK() here but it doesn't compile. It's not + // worth the time figuring out why. + EXPECT_TRUE(s.ok()); + for (const auto& f : files) { + if (f.find("LOG") != std::string::npos) { + ret.push_back(f); + } + } + return ret; + } + + // Delete all log files under kTestDir + void CleanupLogFiles() { + for (const std::string& f : GetLogFiles()) { + ASSERT_OK(default_env->DeleteFile(kTestDir + "/" + f)); + } + } + + void RollNTimesBySize(Logger* auto_roll_logger, size_t file_num, + size_t max_log_file_size) { + // Roll the log file_num + 1 times; the logger's trimming then enforces + // its keep_log_file_num limit. + std::string dummy_large_string; + dummy_large_string.assign(max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < file_num + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + } static const std::string kSampleMessage; static const std::string kTestDir; @@ -77,21 +127,6 @@ const std::string AutoRollLoggerTest::kLogFile( test::PerThreadDBPath("db_log_test") + "/LOG"); Env* AutoRollLoggerTest::default_env = Env::Default(); -// In this test we only want to Log some simple log message with -// no format. LogMessage() provides such a simple interface and -// avoids the [format-security] warning which occurs when you -// call ROCKS_LOG_INFO(logger, log_message) directly.
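// Illustrative background (not part of the patch): passing a runtime string
// directly as the format, as in ROCKS_LOG_INFO(logger, log_message), is what
// -Wformat-security rejects; routing it through a literal "%s" format, as the
// helpers above do, keeps the format string constant and the call safe.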
-namespace { -void LogMessage(Logger* logger, const char* message) { - ROCKS_LOG_INFO(logger, "%s", message); -} - -void LogMessage(const InfoLogLevel log_level, Logger* logger, - const char* message) { - Log(log_level, logger, "%s", message); -} -} // namespace - void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message) { @@ -159,8 +194,10 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, TEST_F(AutoRollLoggerTest, RollLogFileBySize) { InitTestDb(); size_t log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, + keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); @@ -171,11 +208,12 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { size_t time = 2; size_t log_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); // -- Test the existence of the file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); RollLogFileByTimeTest(&nse, &logger, time, @@ -192,28 +230,30 @@ TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { // treated as "signed". size_t kZero = 0; size_t log_size = 1024; + size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", + log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. - logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { size_t time = 2, log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, + keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, @@ -269,6 +309,107 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { kSampleMessage + ":CreateLoggerFromOptions - both"); RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); + + // Set keep_log_file_num + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll.
+ LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector<std::string> files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + + CleanupLogFiles(); + } + + // Set keep_log_file_num and dbname is different from + // db_log_dir. + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + options.db_log_dir = kTestDir; + ASSERT_OK(CreateLoggerFromOptions("/dummy/db/name", options, &logger)); + auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector<std::string> files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + for (const auto& f : files) { + ASSERT_TRUE(f.find("dummy") != std::string::npos); + } + + // Clean up those files. + CleanupLogFiles(); + } +} + +TEST_F(AutoRollLoggerTest, AutoDeleting) { + for (int attempt = 0; attempt < 2; attempt++) { + // In the first attempt, db_log_dir is not set, while in the + // second it is set. + std::string dbname = (attempt == 0) ? kTestDir : "/test/dummy/dir"; + std::string db_log_dir = (attempt == 0) ? "" : kTestDir; + + InitTestDb(); + const size_t kMaxFileSize = 512; + { + size_t log_num = 8; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + RollNTimesBySize(&logger, log_num, kMaxFileSize); + + ASSERT_EQ(log_num, GetLogFiles().size()); + } + // Shrink number of files + { + size_t log_num = 5; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(log_num, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + // Increase number of files again. + { + size_t log_num = 7; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(6, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + CleanupLogFiles(); + } } TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { @@ -322,7 +463,7 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // an extra scope to force the AutoRollLogger to flush the log file when it // becomes out of scope.
{ - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -360,7 +501,7 @@ TEST_F(AutoRollLoggerTest, Close) { size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -446,8 +587,9 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + /*keep_log_file_num=*/10); if (test_num == 0) { // Log some headers explicitly using Header() diff --git a/utilities/convenience/info_log_finder.cc b/utilities/convenience/info_log_finder.cc index 3e599961630..646362aa2c2 100644 --- a/utilities/convenience/info_log_finder.cc +++ b/utilities/convenience/info_log_finder.cc @@ -14,35 +14,12 @@ namespace rocksdb { Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list) { - uint64_t number = 0; - FileType type; - std::string path; - if (!db) { return Status::InvalidArgument("DB pointer is not valid"); } - + std::string parent_path; const Options& options = db->GetOptions(); - if (!options.db_log_dir.empty()) { - path = options.db_log_dir; - } else { - path = db->GetName(); - } - InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), db->GetName()); - auto* env = options.env; - std::vector<std::string> file_names; - Status s = env->GetChildren(path, &file_names); - - if (!s.ok()) { - return s; - } - - for (auto f : file_names) { - if (ParseFileName(f, &number, info_log_prefix.prefix, &type) && - (type == kInfoLogFile)) { - info_log_list->push_back(f); - } - } - return Status::OK(); + return GetInfoLogFiles(options.env, options.db_log_dir, db->GetName(), + &parent_path, info_log_list); } } // namespace rocksdb From a3609b7dde4b8a37602c74d5cf08a502a067198e Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 31 May 2019 11:37:21 -0700 Subject: [PATCH 095/572] Improve const correctness in BlockBasedTableReader (#5383) Summary: Many methods are passing around pointers to non-const objects when in fact they do not/should not modify said objects. The patch makes the semantics clearer and also helps from a thread safety point-of-view by changing some pointers to pointers-to-const and marking some instance methods as const. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5383 Differential Revision: D15562770 Pulled By: ltamasi fbshipit-source-id: 89361dadbb8b25bbe54d17e8da28fee24a2419af --- table/block_based/block_based_table_reader.cc | 36 ++++++++++--------- table/block_based/block_based_table_reader.h | 26 ++++++-------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 944a1fde43e..b7fba779f47 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -203,19 +203,20 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, // in the cache or not.
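// A minimal sketch of the const-correctness pattern this patch applies
// (ExampleTable and ExampleReader are hypothetical stand-ins, not RocksDB
// types): read paths hold a pointer-to-const and are marked const, so the
// compiler itself guarantees that lookups never mutate shared table state.
//
//   class ExampleReader {
//    public:
//     explicit ExampleReader(const ExampleTable* t) : table_(t) {}
//     bool Contains(const Slice& key) const;  // may read, cannot modify
//    private:
//     const ExampleTable* table_;  // read-only view shared across threads
//   };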
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { public: - IndexReaderCommon(BlockBasedTable* t, CachableEntry<Block>&& index_block) + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) : table_(t), index_block_(std::move(index_block)) { assert(table_ != nullptr); } protected: - static Status ReadIndexBlock(BlockBasedTable* table, + static Status ReadIndexBlock(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, CachableEntry<Block>* index_block); - BlockBasedTable* table() const { return table_; } + const BlockBasedTable* table() const { return table_; } const InternalKeyComparator* internal_comparator() const { assert(table_ != nullptr); @@ -256,12 +257,12 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { } private: - BlockBasedTable* table_; + const BlockBasedTable* table_; CachableEntry<Block> index_block_; }; Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( - BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, CachableEntry<Block>* index_block) { PERF_TIMER_GUARD(read_index_block_nanos); @@ -304,7 +305,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // `PartitionIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. - static Status Create(BlockBasedTable* table, + static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader) { assert(table != nullptr); @@ -473,7 +474,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } private: - PartitionIndexReader(BlockBasedTable* t, CachableEntry<Block>&& index_block) + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) : IndexReaderCommon(t, std::move(index_block)) {} std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; @@ -488,7 +490,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. - static Status Create(BlockBasedTable* table, + static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader) { assert(table != nullptr); @@ -553,7 +555,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { } private: - BinarySearchIndexReader(BlockBasedTable* t, + BinarySearchIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) : IndexReaderCommon(t, std::move(index_block)) {} }; @@ -562,7 +564,7 @@ // key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon { public: - static Status Create(BlockBasedTable* table, + static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader) { @@ -699,7 +701,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { } private: - HashIndexReader(BlockBasedTable* t, CachableEntry<Block>&& index_block) + HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) : IndexReaderCommon(t, std::move(index_block)) {} std::unique_ptr<BlockPrefixIndex> prefix_index_; @@ -1188,7 +1190,7 @@ Status BlockBasedTable::ReadRangeDelBlock( } Status BlockBasedTable::ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, + const Rep* rep, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr<const BlockContents>* compression_dict_block) { assert(compression_dict_block != nullptr); Status s; @@ -1842,7 +1844,7 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter( } CachableEntry<UncompressionDict> -BlockBasedTable::GetUncompressionDict(Rep* rep, +BlockBasedTable::GetUncompressionDict(const Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context) { if (!rep->table_options.cache_index_and_filter_blocks) { @@ -1925,7 +1927,7 @@ BlockBasedTable::GetUncompressionDict(Rep* rep, // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, GetContext* get_context) { + IndexBlockIter* input_iter, GetContext* get_context) const { assert(rep_ != nullptr); assert(rep_->index_reader != nullptr); @@ -1941,7 +1943,7 @@ InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator( // If input_iter is not null, update this iter and return it template <typename TBlockIter> TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& handle, + const Rep* rep, const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, bool is_index, bool key_includes_seq, bool index_key_is_full, GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) { @@ -2164,7 +2166,7 @@ Status BlockBasedTable::RetrieveBlock( } BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( - BlockBasedTable* table, + const BlockBasedTable* table, std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, bool index_key_includes_seq, bool index_key_is_full) : table_(table), @@ -2214,7 +2216,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( bool BlockBasedTable::PrefixMayMatch( const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check) { + const bool need_upper_bound_check) const { if (!rep_->filter_policy) { return true; } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 3af617fecfa..f6f610ca2ac 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -112,7 +112,7 @@ class BlockBasedTable : public TableReader { bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check); + const bool need_upper_bound_check) const; // Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must @@ -215,18 +215,12 @@ class BlockBasedTable : public TableReader { struct Rep; Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } // input_iter: if it is not null, update this one and return it as Iterator template <typename TBlockIter> static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr); - template <typename TBlockIter> - static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, + const Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, bool key_includes_seq = true, bool index_key_is_full = true, GetContext* get_context = nullptr, Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr); @@ -283,7 +277,7 @@ class BlockBasedTable : public TableReader { const SliceTransform* prefix_extractor = nullptr) const; static CachableEntry<UncompressionDict> GetUncompressionDict( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, + const Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context); // Get the iterator from the index reader. @@ -299,7 +293,7 @@ class BlockBasedTable : public TableReader { InternalIteratorBase<BlockHandle>* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check = false, IndexBlockIter* input_iter = nullptr, - GetContext* get_context = nullptr); + GetContext* get_context = nullptr) const; // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -386,7 +380,7 @@ class BlockBasedTable : public TableReader { InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator); static Status ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, + const Rep* rep, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr<const BlockContents>* compression_dict_block); static Status PrefetchIndexAndFilterBlocks( Rep* rep, FilePrefetchBuffer* prefetch_buffer, @@ -430,7 +424,7 @@ class BlockBasedTable::PartitionedIndexIteratorState : public TwoLevelIteratorState { public: PartitionedIndexIteratorState( - BlockBasedTable* table, + const BlockBasedTable* table, std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, const bool index_key_includes_seq, const bool index_key_is_full); InternalIteratorBase<BlockHandle>* NewSecondaryIterator( @@ -438,7 +432,7 @@ class BlockBasedTable::PartitionedIndexIteratorState private: // Don't own table_ - BlockBasedTable* table_; + const BlockBasedTable* table_; std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_; bool index_key_includes_seq_; bool index_key_is_full_; @@ -561,7 +555,7 @@ struct BlockBasedTable::Rep { template <class TBlockIter, typename TValue = Slice> class BlockBasedTableIterator : public InternalIteratorBase<TValue> { public: - BlockBasedTableIterator(BlockBasedTable* table, + BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, InternalIteratorBase<BlockHandle>* index_iter, @@ -681,7 +675,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> { void CheckOutOfBound(); private: - BlockBasedTable* table_; + const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; From 49c5a12dbee3aa65907e772b254d753c6d391da1 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Fri, 31 May 2019 11:52:59 -0700 Subject: [PATCH 096/572] Organizing rocksdb/db
directory Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5390 Differential Revision: D15579388 Pulled By: vjnadimpalli fbshipit-source-id: 5bfc95e31554b8ff05b97b76d6534113f527f366 --- CMakeLists.txt | 42 +++++++++---------- Makefile | 10 ++--- TARGETS | 42 +++++++++---------- db/builder.cc | 2 +- db/column_family.cc | 10 ++--- db/column_family_test.cc | 2 +- db/compact_files_test.cc | 2 +- db/compacted_db_impl.cc | 2 +- db/compacted_db_impl.h | 2 +- db/{ => compaction}/compaction.cc | 3 +- db/{ => compaction}/compaction.h | 0 .../compaction_iteration_stats.h | 0 db/{ => compaction}/compaction_iterator.cc | 3 +- db/{ => compaction}/compaction_iterator.h | 4 +- .../compaction_iterator_test.cc | 2 +- db/{ => compaction}/compaction_job.cc | 4 +- db/{ => compaction}/compaction_job.h | 2 +- .../compaction_job_stats_test.cc | 2 +- db/{ => compaction}/compaction_job_test.cc | 2 +- db/{ => compaction}/compaction_picker.cc | 2 +- db/{ => compaction}/compaction_picker.h | 2 +- db/{ => compaction}/compaction_picker_fifo.cc | 2 +- db/{ => compaction}/compaction_picker_fifo.h | 2 +- .../compaction_picker_level.cc | 3 +- db/{ => compaction}/compaction_picker_level.h | 2 +- db/{ => compaction}/compaction_picker_test.cc | 9 ++-- .../compaction_picker_universal.cc | 2 +- .../compaction_picker_universal.h | 2 +- db/convenience.cc | 2 +- db/corruption_test.cc | 2 +- db/cuckoo_table_db_test.cc | 2 +- db/db_filesnapshot.cc | 2 +- db/{ => db_impl}/db_impl.cc | 4 +- db/{ => db_impl}/db_impl.h | 4 +- db/{ => db_impl}/db_impl_compaction_flush.cc | 2 +- db/{ => db_impl}/db_impl_debug.cc | 2 +- db/{ => db_impl}/db_impl_experimental.cc | 2 +- db/{ => db_impl}/db_impl_files.cc | 2 +- db/{ => db_impl}/db_impl_open.cc | 2 +- db/{ => db_impl}/db_impl_readonly.cc | 4 +- db/{ => db_impl}/db_impl_readonly.h | 2 +- db/{ => db_impl}/db_impl_secondary.cc | 2 +- db/{ => db_impl}/db_impl_secondary.h | 2 +- db/{ => db_impl}/db_impl_write.cc | 2 +- db/{ => db_impl}/db_secondary_test.cc | 2 +- db/db_iter.h | 2 +- db/db_options_test.cc | 2 +- db/db_test.cc | 2 +- db/db_test_util.h | 2 +- db/deletefile_test.cc | 2 +- db/error_handler.cc | 2 +- db/experimental.cc | 2 +- db/fault_injection_test.cc | 2 +- db/forward_iterator.cc | 2 +- db/in_memory_stats_history.cc | 2 +- db/internal_stats.cc | 2 +- db/listener_test.cc | 2 +- db/memtable_list.cc | 2 +- db/merge_test.cc | 5 ++- db/obsolete_files_test.cc | 2 +- db/options_file_test.cc | 2 +- db/plain_table_db_test.cc | 2 +- db/prefix_test.cc | 2 +- db/range_del_aggregator.cc | 2 +- db/range_del_aggregator.h | 2 +- db/repair.cc | 2 +- db/repair_test.cc | 2 +- db/table_properties_collector_test.cc | 2 +- db/version_set.cc | 2 +- db/version_set.h | 4 +- db/wal_manager_test.cc | 2 +- db/write_batch.cc | 2 +- db/write_callback_test.cc | 2 +- file/sst_file_manager_impl.cc | 2 +- file/sst_file_manager_impl.h | 2 +- src.mk | 42 +++++++++---------- table/table_reader_bench.cc | 2 +- tools/db_bench_tool.cc | 2 +- tools/db_stress.cc | 2 +- tools/ldb_cmd.cc | 3 +- tools/reduce_levels_test.cc | 2 +- tools/trace_analyzer_tool.cc | 2 +- util/trace_replay.cc | 3 +- utilities/backupable/backupable_db_test.cc | 2 +- utilities/blob_db/blob_db_impl.cc | 2 +- utilities/blob_db/blob_file.cc | 3 +- .../cassandra/cassandra_functional_test.cc | 2 +- utilities/checkpoint/checkpoint_test.cc | 2 +- utilities/debug.cc | 2 +- utilities/memory/memory_test.cc | 2 +- utilities/memory/memory_util.cc | 2 +- .../transactions/optimistic_transaction.cc | 2 +- 
.../optimistic_transaction_db_impl.cc | 2 +- .../transactions/pessimistic_transaction.cc | 2 +- .../pessimistic_transaction_db.cc | 2 +- utilities/transactions/transaction_base.cc | 2 +- utilities/transactions/transaction_test.cc | 2 +- utilities/transactions/transaction_test.h | 2 +- utilities/transactions/transaction_util.cc | 2 +- .../write_prepared_transaction_test.cc | 2 +- utilities/transactions/write_prepared_txn.cc | 2 +- .../transactions/write_prepared_txn_db.cc | 2 +- .../transactions/write_unprepared_txn.cc | 2 +- utilities/ttl/db_ttl_impl.h | 2 +- .../write_batch_with_index.cc | 2 +- 105 files changed, 186 insertions(+), 184 deletions(-) rename db/{ => compaction}/compaction.cc (99%) rename db/{ => compaction}/compaction.h (100%) rename db/{ => compaction}/compaction_iteration_stats.h (100%) rename db/{ => compaction}/compaction_iterator.cc (99%) rename db/{ => compaction}/compaction_iterator.h (99%) rename db/{ => compaction}/compaction_iterator_test.cc (99%) rename db/{ => compaction}/compaction_job.cc (99%) rename db/{ => compaction}/compaction_job.h (99%) rename db/{ => compaction}/compaction_job_stats_test.cc (99%) rename db/{ => compaction}/compaction_job_test.cc (99%) rename db/{ => compaction}/compaction_picker.cc (99%) rename db/{ => compaction}/compaction_picker.h (99%) rename db/{ => compaction}/compaction_picker_fifo.cc (99%) rename db/{ => compaction}/compaction_picker_fifo.h (98%) rename db/{ => compaction}/compaction_picker_level.cc (99%) rename db/{ => compaction}/compaction_picker_level.h (96%) rename db/{ => compaction}/compaction_picker_test.cc (99%) rename db/{ => compaction}/compaction_picker_universal.cc (99%) rename db/{ => compaction}/compaction_picker_universal.h (98%) rename db/{ => db_impl}/db_impl.cc (99%) rename db/{ => db_impl}/db_impl.h (99%) rename db/{ => db_impl}/db_impl_compaction_flush.cc (99%) rename db/{ => db_impl}/db_impl_debug.cc (99%) rename db/{ => db_impl}/db_impl_experimental.cc (99%) rename db/{ => db_impl}/db_impl_files.cc (99%) rename db/{ => db_impl}/db_impl_open.cc (99%) rename db/{ => db_impl}/db_impl_readonly.cc (99%) rename db/{ => db_impl}/db_impl_readonly.h (99%) rename db/{ => db_impl}/db_impl_secondary.cc (99%) rename db/{ => db_impl}/db_impl_secondary.h (99%) rename db/{ => db_impl}/db_impl_write.cc (99%) rename db/{ => db_impl}/db_secondary_test.cc (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a4d9deb1b6..4c2fa7119c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -486,24 +486,24 @@ set(SOURCES db/c.cc db/column_family.cc db/compacted_db_impl.cc - db/compaction.cc - db/compaction_iterator.cc - db/compaction_job.cc - db/compaction_picker.cc - db/compaction_picker_fifo.cc - db/compaction_picker_level.cc - db/compaction_picker_universal.cc + db/compaction/compaction.cc + db/compaction/compaction_iterator.cc + db/compaction/compaction_picker.cc + db/compaction/compaction_job.cc + db/compaction/compaction_picker_fifo.cc + db/compaction/compaction_picker_level.cc + db/compaction/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc - db/db_impl.cc - db/db_impl_write.cc - db/db_impl_compaction_flush.cc - db/db_impl_files.cc - db/db_impl_open.cc - db/db_impl_debug.cc - db/db_impl_experimental.cc - db/db_impl_readonly.cc - db/db_impl_secondary.cc + db/db_impl/db_impl.cc + db/db_impl/db_impl_write.cc + db/db_impl/db_impl_compaction_flush.cc + db/db_impl/db_impl_files.cc + db/db_impl/db_impl_open.cc + db/db_impl/db_impl_debug.cc + db/db_impl/db_impl_experimental.cc + db/db_impl/db_impl_readonly.cc 
+ db/db_impl/db_impl_secondary.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc @@ -868,10 +868,10 @@ if(WITH_TESTS) cache/lru_cache_test.cc db/column_family_test.cc db/compact_files_test.cc - db/compaction_iterator_test.cc - db/compaction_job_stats_test.cc - db/compaction_job_test.cc - db/compaction_picker_test.cc + db/compaction/compaction_job_stats_test.cc + db/compaction/compaction_job_test.cc + db/compaction/compaction_iterator_test.cc + db/compaction/compaction_picker_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc @@ -894,7 +894,7 @@ if(WITH_TESTS) db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_secondary_test.cc + db/db_impl/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc diff --git a/Makefile b/Makefile index 244b929c418..5181154a212 100644 --- a/Makefile +++ b/Makefile @@ -1339,13 +1339,13 @@ write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_i flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_iterator_test: db/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_iterator_test: db/compaction/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_test: db/compaction/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_stats_test: db/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_stats_test: db/compaction/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1417,7 +1417,7 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_picker_test: db/compaction/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1585,7 +1585,7 @@ range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_secondary_test: db/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) #------------------------------------------------- diff --git a/TARGETS b/TARGETS index dc39f87bcef..edddc7b99be 100644 --- a/TARGETS +++ b/TARGETS @@ -83,24 +83,24 @@ cpp_library( "db/c.cc", "db/column_family.cc", "db/compacted_db_impl.cc", - "db/compaction.cc", - "db/compaction_iterator.cc", - "db/compaction_job.cc", - "db/compaction_picker.cc", - "db/compaction_picker_fifo.cc", - "db/compaction_picker_level.cc", - "db/compaction_picker_universal.cc", + "db/compaction/compaction.cc", + "db/compaction/compaction_iterator.cc", + "db/compaction/compaction_job.cc", + "db/compaction/compaction_picker.cc", + "db/compaction/compaction_picker_fifo.cc", + "db/compaction/compaction_picker_level.cc", + "db/compaction/compaction_picker_universal.cc", "db/convenience.cc", "db/db_filesnapshot.cc", - "db/db_impl.cc", - "db/db_impl_compaction_flush.cc", - "db/db_impl_debug.cc", - "db/db_impl_experimental.cc", - 
"db/db_impl_files.cc", - "db/db_impl_open.cc", - "db/db_impl_readonly.cc", - "db/db_impl_secondary.cc", - "db/db_impl_write.cc", + "db/db_impl/db_impl.cc", + "db/db_impl/db_impl_compaction_flush.cc", + "db/db_impl/db_impl_debug.cc", + "db/db_impl/db_impl_experimental.cc", + "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_open.cc", + "db/db_impl/db_impl_readonly.cc", + "db/db_impl/db_impl_secondary.cc", + "db/db_impl/db_impl_write.cc", "db/db_info_dumper.cc", "db/db_iter.cc", "db/dbformat.cc", @@ -454,22 +454,22 @@ ROCKS_TESTS = [ ], [ "compaction_iterator_test", - "db/compaction_iterator_test.cc", + "db/compaction/compaction_iterator_test.cc", "serial", ], [ "compaction_job_stats_test", - "db/compaction_job_stats_test.cc", + "db/compaction/compaction_job_stats_test.cc", "serial", ], [ "compaction_job_test", - "db/compaction_job_test.cc", + "db/compaction/compaction_job_test.cc", "serial", ], [ "compaction_picker_test", - "db/compaction_picker_test.cc", + "db/compaction/compaction_picker_test.cc", "serial", ], [ @@ -609,7 +609,7 @@ ROCKS_TESTS = [ ], [ "db_secondary_test", - "db/db_secondary_test.cc", + "db/db_impl/db_secondary_test.cc", "serial", ], [ diff --git a/db/builder.cc b/db/builder.cc index 86aac02ab74..67d764ad18b 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" diff --git a/db/column_family.cc b/db/column_family.cc index fde1996aeaf..ce22a00aac3 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -19,11 +19,11 @@ #include #include -#include "db/compaction_picker.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_level.h" -#include "db/compaction_picker_universal.h" -#include "db/db_impl.h" +#include "db/compaction/compaction_picker.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/db_impl/db_impl.h" #include "db/internal_stats.h" #include "db/job_context.h" #include "db/range_del_aggregator.h" diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 21b3321bea6..9374a135866 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -12,8 +12,8 @@ #include #include -#include "db/db_impl.h" #include "db/db_test_util.h" +#include "db/db_impl/db_impl.h" #include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 438fdb7c96f..92975da87c1 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -10,7 +10,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index acdaad4ec29..88928391ad2 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE #include "db/compacted_db_impl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "table/get_context.h" diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 5c574b4b9a5..8c1a1428c81 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -5,7 +5,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include 
#include diff --git a/db/compaction.cc b/db/compaction/compaction.cc similarity index 99% rename from db/compaction.cc rename to db/compaction/compaction.cc index 089dd66848e..5dc7e83c8fc 100644 --- a/db/compaction.cc +++ b/db/compaction/compaction.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction.h" - #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -17,6 +15,7 @@ #include #include "db/column_family.h" +#include "db/compaction/compaction.h" #include "rocksdb/compaction_filter.h" #include "test_util/sync_point.h" #include "util/string_util.h" diff --git a/db/compaction.h b/db/compaction/compaction.h similarity index 100% rename from db/compaction.h rename to db/compaction/compaction.h diff --git a/db/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h similarity index 100% rename from db/compaction_iteration_stats.h rename to db/compaction/compaction_iteration_stats.h diff --git a/db/compaction_iterator.cc b/db/compaction/compaction_iterator.cc similarity index 99% rename from db/compaction_iterator.cc rename to db/compaction/compaction_iterator.cc index 7e060969962..135018f5148 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_iterator.h" - +#include "db/compaction/compaction_iterator.h" #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" diff --git a/db/compaction_iterator.h b/db/compaction/compaction_iterator.h similarity index 99% rename from db/compaction_iterator.h rename to db/compaction/compaction_iterator.h index 6ab43b1becf..9744ab8dfc8 100644 --- a/db/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -10,8 +10,8 @@ #include #include -#include "db/compaction.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" +#include "db/compaction/compaction.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc similarity index 99% rename from db/compaction_iterator_test.cc rename to db/compaction/compaction_iterator_test.cc index 99bb026b5a9..ddda79a4cfe 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_iterator.h" #include #include +#include "db/compaction/compaction_iterator.h" #include "port/port.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/db/compaction_job.cc b/db/compaction/compaction_job.cc similarity index 99% rename from db/compaction_job.cc rename to db/compaction/compaction_job.cc index 92a6fab8da8..3866d70ee00 100644 --- a/db/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction_job.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -24,8 +23,9 @@ #include #include +#include "db/compaction/compaction_job.h" #include "db/builder.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/error_handler.h" diff --git a/db/compaction_job.h b/db/compaction/compaction_job.h similarity index 99% rename from db/compaction_job.h rename to db/compaction/compaction_job.h index 0751727d704..1387fffb1c1 100644 --- a/db/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -18,7 +18,7 @@ #include #include "db/column_family.h" -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" diff --git a/db/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc similarity index 99% rename from db/compaction_job_stats_test.cc rename to db/compaction/compaction_job_stats_test.cc index 35c1100f99b..91310e9f112 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -21,7 +21,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/job_context.h" #include "db/version_set.h" diff --git a/db/compaction_job_test.cc b/db/compaction/compaction_job_test.cc similarity index 99% rename from db/compaction_job_test.cc rename to db/compaction/compaction_job_test.cc index 93e55b7a03b..838cda5eaca 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -17,7 +17,7 @@ #include #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/error_handler.h" #include "db/version_set.h" #include "rocksdb/cache.h" diff --git a/db/compaction_picker.cc b/db/compaction/compaction_picker.cc similarity index 99% rename from db/compaction_picker.cc rename to db/compaction/compaction_picker.cc index bfe13828b18..4276ea9cb41 100644 --- a/db/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/compaction_picker.h b/db/compaction/compaction_picker.h similarity index 99% rename from db/compaction_picker.h rename to db/compaction/compaction_picker.h index 437c8d30473..53477014cf6 100644 --- a/db/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -15,7 +15,7 @@ #include #include -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/version_set.h" #include "options/cf_options.h" #include "rocksdb/env.h" diff --git a/db/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc similarity index 99% rename from db/compaction_picker_fifo.cc rename to db/compaction/compaction_picker_fifo.cc index eadb31f9ee5..ffb5a9f6495 100644 --- a/db/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_fifo.h" #ifndef ROCKSDB_LITE #ifndef __STDC_FORMAT_MACROS diff --git a/db/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h similarity index 98% rename from db/compaction_picker_fifo.h rename to db/compaction/compaction_picker_fifo.h index 9da107c5d4a..a4e63803cf8 100644 --- a/db/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -10,7 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { class FIFOCompactionPicker : public CompactionPicker { diff --git a/db/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc similarity index 99% rename from db/compaction_picker_level.cc rename to db/compaction/compaction_picker_level.cc index 70fe46c5b81..aeb368ea20a 100644 --- a/db/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker_level.h" - #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -17,6 +15,7 @@ #include #include +#include "db/compaction/compaction_picker_level.h" #include "test_util/sync_point.h" #include "util/log_buffer.h" diff --git a/db/compaction_picker_level.h b/db/compaction/compaction_picker_level.h similarity index 96% rename from db/compaction_picker_level.h rename to db/compaction/compaction_picker_level.h index 1d37fe50eaf..9fc196698a1 100644 --- a/db/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -9,7 +9,7 @@ #pragma once -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { // Picking compactions for leveled compaction. See wiki page diff --git a/db/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc similarity index 99% rename from db/compaction_picker_test.cc rename to db/compaction/compaction_picker_test.cc index c3e9e450ff0..bab93227a4f 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -3,15 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_picker.h" #include #include #include -#include "db/compaction.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_level.h" -#include "db/compaction_picker_universal.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff --git a/db/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc similarity index 99% rename from db/compaction_picker_universal.cc rename to db/compaction/compaction_picker_universal.cc index 20edd30748d..465245715fd 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction_picker_universal.h" +#include "db/compaction/compaction_picker_universal.h" #ifndef ROCKSDB_LITE #ifndef __STDC_FORMAT_MACROS diff --git a/db/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h similarity index 98% rename from db/compaction_picker_universal.h rename to db/compaction/compaction_picker_universal.h index 375e5998e25..2c44735d95f 100644 --- a/db/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -10,7 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { class UniversalCompactionPicker : public CompactionPicker { diff --git a/db/convenience.cc b/db/convenience.cc index 71c237f60c0..c11653fb190 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -8,7 +8,7 @@ #include "rocksdb/convenience.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "util/cast_util.h" namespace rocksdb { diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 53c4d42d28a..9e83c9080e6 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -16,7 +16,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" #include "file/filename.h" diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 6f60e2d7037..135a34c2e09 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "table/cuckoo/cuckoo_table_factory.h" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 59757aeb9f7..ac544793ee4 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -14,7 +14,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" #include "file/file_util.h" diff --git a/db/db_impl.cc b/db/db_impl/db_impl.cc similarity index 99% rename from db/db_impl.cc rename to db/db_impl/db_impl.cc index 5534c225f4d..196e38f14fa 100644 --- a/db/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -28,7 +28,7 @@ #include #include "db/builder.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" diff --git a/db/db_impl.h b/db/db_impl/db_impl.h similarity index 99% rename from db/db_impl.h rename to db/db_impl/db_impl.h index 4c418d6f38f..27d39f90d24 100644 --- a/db/db_impl.h +++ b/db/db_impl/db_impl.h @@ -20,7 +20,7 @@ #include #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" @@ -39,7 +39,7 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "memtable_list.h" +#include "db/memtable_list.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc similarity index 99% rename from db/db_impl_compaction_flush.cc rename to db/db_impl/db_impl_compaction_flush.cc index c6025a8cc57..881fa26af37 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc similarity index 99% rename from db/db_impl_debug.cc rename to db/db_impl/db_impl_debug.cc index f558971190e..4b558facb37 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -9,7 +9,7 @@ #ifndef NDEBUG -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "monitoring/thread_status_updater.h" diff --git a/db/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc similarity index 99% rename from db/db_impl_experimental.cc rename to db/db_impl/db_impl_experimental.cc index 47a880199e2..a8fed40be01 100644 --- a/db/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_files.cc b/db/db_impl/db_impl_files.cc similarity index 99% rename from db/db_impl_files.cc rename to db/db_impl/db_impl_files.cc index 64c6dc96879..608c8ce4948 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_open.cc b/db/db_impl/db_impl_open.cc similarity index 99% rename from db/db_impl_open.cc rename to db/db_impl/db_impl_open.cc index 5dae140c7ea..5019221b5ca 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc similarity index 99% rename from db/db_impl_readonly.cc rename to db/db_impl/db_impl_readonly.cc index 5d7515c28e2..55249228456 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -3,12 +3,12 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl_readonly.h" +#include "db/db_impl/db_impl_readonly.h" #include "db/compacted_db_impl.h" -#include "db/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" +#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" namespace rocksdb { diff --git a/db/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h similarity index 99% rename from db/db_impl_readonly.h rename to db/db_impl/db_impl_readonly.h index 23816210dc8..18df900cba0 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -9,7 +9,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc similarity index 99% rename from db/db_impl_secondary.cc rename to db/db_impl/db_impl_secondary.cc index a8ea921a260..a976a5750dd 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl_secondary.h" +#include "db/db_impl/db_impl_secondary.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h similarity index 99% rename from db/db_impl_secondary.h rename to db/db_impl/db_impl_secondary.h index a57835432dc..24cfd33c11d 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -9,7 +9,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/db_impl_write.cc b/db/db_impl/db_impl_write.cc similarity index 99% rename from db/db_impl_write.cc rename to db/db_impl/db_impl_write.cc index 98463f7b27f..02e23e26931 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -6,7 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS diff --git a/db/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc similarity index 99% rename from db/db_secondary_test.cc rename to db/db_impl/db_secondary_test.cc index 23132434f1f..c9184281c22 100644 --- a/db/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl_secondary.h" #include "db/db_test_util.h" +#include "db/db_impl/db_impl_secondary.h" #include "port/stack_trace.h" #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" diff --git a/db/db_iter.h b/db/db_iter.h index 85b546c544c..9a6df9610a4 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -10,10 +10,10 @@ #pragma once #include #include -#include "db/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" +#include "db/db_impl/db_impl.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" diff --git a/db/db_options_test.cc b/db/db_options_test.cc index b899ba18b4a..36ecf3a1b57 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -11,8 +11,8 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" #include "db/db_test_util.h" +#include "db/db_impl/db_impl.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" diff --git a/db/db_test.cc b/db/db_test.cc index debb2ba603e..4c4bd382ca8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -24,7 +24,7 @@ #endif #include "cache/lru_cache.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/job_context.h" diff --git a/db/db_test_util.h b/db/db_test_util.h index 2af202fad96..4e9fcafadfa 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -25,7 +25,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "env/mock_env.h" #include "file/filename.h" diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 280d269f1c6..18014e5b435 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -13,7 +13,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" diff --git a/db/error_handler.cc b/db/error_handler.cc index 140fb4850f6..1d818f48948 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" -#include "db/db_impl.h" #include "db/event_helpers.h" +#include "db/db_impl/db_impl.h" #include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/experimental.cc b/db/experimental.cc index d509a37bf2e..0c3c3335d92 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -5,7 +5,7 @@ #include "rocksdb/experimental.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { namespace experimental { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 00619d447d1..e6ce1fa8364 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,9 +11,9 @@ // the last "sync". 
It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". -#include "db/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" +#include "db/db_impl/db_impl.h" #include "env/mock_env.h" #include "file/filename.h" #include "rocksdb/cache.h" diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 9e0823366d0..2633a3ff9bd 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -11,7 +11,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/job_context.h" diff --git a/db/in_memory_stats_history.cc b/db/in_memory_stats_history.cc index e9e0cc74950..41fdb71c8c1 100644 --- a/db/in_memory_stats_history.cc +++ b/db/in_memory_stats_history.cc @@ -6,8 +6,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" #include "db/in_memory_stats_history.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 58332f30faf..21dde297ab6 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -22,7 +22,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "table/block_based/block_based_table_factory.h" #include "util/string_util.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 81a0fa17678..6fabf197f2c 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "db/db_impl/db_impl.h" #include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 2b4ac6b84da..ca5283139a5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -13,7 +13,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" diff --git a/db/merge_test.cc b/db/merge_test.cc index 13c35d2c017..1b62b5c2c57 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,8 +7,8 @@ #include #include -#include "db/db_impl.h" #include "db/dbformat.h" +#include "db/db_impl/db_impl.h" #include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" @@ -18,6 +18,9 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "db/db_impl/db_impl.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 655c659b44f..3a78869c95d 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -13,7 +13,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" diff --git a/db/options_file_test.cc b/db/options_file_test.cc index c7eba52c290..b86ecefa97a 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -6,7 +6,7 
@@ #ifndef ROCKSDB_LITE #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "rocksdb/options.h" #include "rocksdb/table.h" diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index a73dd3cb431..d2d0426e652 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -12,7 +12,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 3f2e794a6c4..19f02f1099a 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -17,7 +17,7 @@ int main() { #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 8f86528ecb2..7c188aeaa07 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -5,7 +5,7 @@ #include "db/range_del_aggregator.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index ce7897a975a..96cfb581309 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/repair.cc b/db/repair.cc index 577c122bcf9..400e754ba45 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -66,7 +66,7 @@ #include #include "db/builder.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/log_reader.h" #include "db/log_writer.h" diff --git a/db/repair_test.cc b/db/repair_test.cc index 1851cde0dfc..21907e43575 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -9,7 +9,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "file/file_util.h" #include "rocksdb/comparator.h" diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index e818f46142c..a9895bbedba 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -9,10 +9,10 @@ #include #include -#include "db/db_impl.h" #include "db/dbformat.h" #include "db/table_properties_collector.h" +#include "db/db_impl/db_impl.h" #include "options/cf_options.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" diff --git a/db/version_set.cc b/db/version_set.cc index 5d0529d2707..26465a01a4e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -23,7 +23,7 @@ #include #include #include -#include "db/compaction.h" +#include "compaction/compaction.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" diff --git a/db/version_set.h b/db/version_set.h index 776e08e448c..c43e4091442 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -29,8 +29,8 @@ #include #include "db/column_family.h" -#include "db/compaction.h" -#include "db/compaction_picker.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/file_indexer.h" #include 
"db/log_reader.h" diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index c0c47b0c34b..3657fb691be 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -13,7 +13,7 @@ #include "rocksdb/write_buffer_manager.h" #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" #include "db/wal_manager.h" diff --git a/db/write_batch.cc b/db/write_batch.cc index 830fbeab15d..1459e5a3aae 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -42,13 +42,13 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" #include "db/write_batch_internal.h" +#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index aa3d077c40d..b5e26a8a7f0 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -11,7 +11,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/write_callback.h" #include "port/port.h" #include "rocksdb/db.h" diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index d63170452c0..efd9e30e6a5 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -12,7 +12,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index b506ece2796..89304227807 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -11,7 +11,7 @@ #include "port/port.h" -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/error_handler.h" #include "file/delete_scheduler.h" #include "rocksdb/sst_file_manager.h" diff --git a/src.mk b/src.mk index 44013bc2e1d..5021acb96ac 100644 --- a/src.mk +++ b/src.mk @@ -7,24 +7,24 @@ LIB_SOURCES = \ db/c.cc \ db/column_family.cc \ db/compacted_db_impl.cc \ - db/compaction.cc \ - db/compaction_iterator.cc \ - db/compaction_job.cc \ - db/compaction_picker.cc \ - db/compaction_picker_fifo.cc \ - db/compaction_picker_level.cc \ - db/compaction_picker_universal.cc \ + db/compaction/compaction.cc \ + db/compaction/compaction_iterator.cc \ + db/compaction/compaction_job.cc \ + db/compaction/compaction_picker.cc \ + db/compaction/compaction_picker_fifo.cc \ + db/compaction/compaction_picker_level.cc \ + db/compaction/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ - db/db_impl.cc \ - db/db_impl_compaction_flush.cc \ - db/db_impl_debug.cc \ - db/db_impl_experimental.cc \ - db/db_impl_files.cc \ - db/db_impl_open.cc \ - db/db_impl_readonly.cc \ - db/db_impl_secondary.cc \ - db/db_impl_write.cc \ + db/db_impl/db_impl.cc \ + db/db_impl/db_impl_compaction_flush.cc \ + db/db_impl/db_impl_debug.cc \ + db/db_impl/db_impl_experimental.cc \ + db/db_impl/db_impl_files.cc \ + db/db_impl/db_impl_open.cc \ + db/db_impl/db_impl_readonly.cc \ + db/db_impl/db_impl_secondary.cc \ + db/db_impl/db_impl_write.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ @@ -259,10 +259,10 @@ MAIN_SOURCES = \ cache/cache_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ - db/compaction_iterator_test.cc \ - db/compaction_job_stats_test.cc \ 
- db/compaction_job_test.cc \ - db/compaction_picker_test.cc \ + db/compaction/compaction_iterator_test.cc \ + db/compaction/compaction_job_test.cc \ + db/compaction/compaction_job_stats_test.cc \ + db/compaction/compaction_picker_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ @@ -286,7 +286,7 @@ MAIN_SOURCES = \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_secondary_test.cc \ + db/db_impl/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 822c2294bb7..2ec7b2d0fb5 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -11,7 +11,7 @@ int main() { } #else -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index d80502f16fa..c6f19bed585 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -33,7 +33,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/malloc_stats.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" diff --git a/tools/db_stress.cc b/tools/db_stress.cc index c112cb348ff..0c828deb165 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -44,7 +44,7 @@ int main() { #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" #include "monitoring/histogram.h" diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 10e9a495d23..d6f9b415707 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License @@ -12,7 +13,7 @@ #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/log_reader.h" #include "db/write_batch_internal.h" diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index 3aa0e3cf36d..8b23dbf369d 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/utilities/ldb_cmd.h" diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 03057afbc78..93528c00608 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -27,7 +27,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "options/cf_options.h" diff --git a/util/trace_replay.cc b/util/trace_replay.cc index c90fef2eff8..9e0e8c48cde 100644 --- a/util/trace_replay.cc +++ b/util/trace_replay.cc @@ -8,7 +8,8 @@ #include #include #include -#include "db/db_impl.h" + +#include "db/db_impl/db_impl.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/coding.h" diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 05006d6a3eb..37d9e4cd182 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -12,7 +12,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "env/env_chroot.h" #include "file/filename.h" #include "port/port.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 04b7eb73e2b..7f447a04ad0 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -11,7 +11,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/write_batch_internal.h" #include "file/file_util.h" #include "file/filename.h" diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index e14307d44cd..4475772d8d1 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License @@ -17,7 +18,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "file/filename.h" #include "util/logging.h" diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index 431ef697929..cec9ce7d88f 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index da2972affd7..d7d2548af3e 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -16,7 +16,7 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/db.h" diff --git a/utilities/debug.cc b/utilities/debug.cc index 72fcbf0f54d..8ddf64b5dc4 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -7,7 +7,7 @@ #include "rocksdb/utilities/debug.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 29903d460f2..75fb9cd3f92 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/cache.h" #include "rocksdb/table.h" #include "rocksdb/utilities/memory_util.h" diff --git a/utilities/memory/memory_util.cc b/utilities/memory/memory_util.cc index 83bf33c1794..47ca4b7bb7d 100644 --- a/utilities/memory/memory_util.cc +++ b/utilities/memory/memory_util.cc @@ -7,7 +7,7 @@ #include "rocksdb/utilities/memory_util.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/utilities/transactions/optimistic_transaction.cc b/utilities/transactions/optimistic_transaction.cc index 48c9180ae9e..e8cf6eade4e 100644 --- a/utilities/transactions/optimistic_transaction.cc +++ b/utilities/transactions/optimistic_transaction.cc @@ -10,7 +10,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/status.h" diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index d9db6fde07e..b7fedc06615 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -10,7 +10,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/optimistic_transaction_db.h" diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 93d75a8357f..ed7444894c7 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -13,7 +13,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/snapshot.h" diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 8920f85fb76..e906b444ff5 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -16,7 +16,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc 
index 68b87b5aa47..d4923a88f4c 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -13,7 +13,7 @@ #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/column_family.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 0750b249bbb..6c9f4bccd62 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -16,7 +16,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 22dc208f523..da2a08d3c52 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -15,7 +15,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction.h" diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc index ec6f7e60ae2..c582b73aa3e 100644 --- a/utilities/transactions/transaction_util.cc +++ b/utilities/transactions/transaction_util.cc @@ -15,7 +15,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/status.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "util/string_util.h" diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index c0a7e278054..8b52b1ae662 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -18,7 +18,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "rocksdb/db.h" #include "rocksdb/options.h" diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 6c7cb359dc4..05650e2b3f9 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -16,7 +16,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index b4a71f5ea6c..bf94d83d82b 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -17,7 +17,7 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 823b12ea171..efd766514c8 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE #include "utilities/transactions/write_unprepared_txn.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "util/cast_util.h" #include "utilities/transactions/write_unprepared_txn_db.h" diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 
8bf064a0466..69e991ed855 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -16,7 +16,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/utility_db.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #ifdef _WIN32 // Windows API macro interference diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 0f8f6c1d622..cf17abf22e9 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -10,7 +10,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "memory/arena.h" From cae22c53fbad071be8aa3a8543415383b4dfaef4 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Fri, 31 May 2019 15:21:36 -0700 Subject: [PATCH 097/572] Make format Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5395 Differential Revision: D15581698 Pulled By: vjnadimpalli fbshipit-source-id: f415972f16e784b1361714c202b97defcab46767 --- db/column_family_test.cc | 2 +- db/compacted_db_impl.h | 4 ++-- db/compaction/compaction_iterator.h | 2 +- db/compaction/compaction_job.cc | 2 +- db/db_impl/db_impl.h | 2 +- db/db_impl/db_impl_readonly.cc | 2 +- db/db_impl/db_secondary_test.cc | 2 +- db/db_iter.h | 2 +- db/db_options_test.cc | 2 +- db/error_handler.cc | 2 +- db/fault_injection_test.cc | 2 +- db/listener_test.cc | 2 +- db/merge_test.cc | 5 +---- db/write_batch.cc | 2 +- utilities/transactions/transaction_base.cc | 2 +- utilities/ttl/db_ttl_impl.h | 6 +++--- 16 files changed, 19 insertions(+), 22 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 9374a135866..63d987f3c99 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -12,8 +12,8 @@ #include #include -#include "db/db_test_util.h" #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 8c1a1428c81..c1b8da9a782 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -5,9 +5,9 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/db_impl/db_impl.h" -#include #include +#include +#include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 9744ab8dfc8..86a2b87b22c 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -10,8 +10,8 @@ #include #include -#include "db/compaction/compaction_iteration_stats.h" #include "db/compaction/compaction.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 3866d70ee00..b782c6ca7ad 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -23,8 +23,8 @@ #include #include -#include "db/compaction/compaction_job.h" #include "db/builder.h" +#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 27d39f90d24..c241a36dbc3 100644 --- a/db/db_impl/db_impl.h 
+++ b/db/db_impl/db_impl.h @@ -30,6 +30,7 @@ #include "db/internal_stats.h" #include "db/log_writer.h" #include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" #include "db/pre_release_callback.h" #include "db/range_del_aggregator.h" #include "db/read_callback.h" @@ -39,7 +40,6 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "db/memtable_list.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 55249228456..6db498397ce 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -6,9 +6,9 @@ #include "db/db_impl/db_impl_readonly.h" #include "db/compacted_db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" -#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" namespace rocksdb { diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c9184281c22..e8eafd673ed 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -7,8 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_test_util.h" #include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" #include "port/stack_trace.h" #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" diff --git a/db/db_iter.h b/db/db_iter.h index 9a6df9610a4..6a4bf8a5507 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -10,10 +10,10 @@ #pragma once #include #include +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" -#include "db/db_impl/db_impl.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 36ecf3a1b57..a9c8d218235 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -11,8 +11,8 @@ #include #include "db/column_family.h" -#include "db/db_test_util.h" #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" diff --git a/db/error_handler.cc b/db/error_handler.cc index 1d818f48948..9e1bf5cc107 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" -#include "db/event_helpers.h" #include "db/db_impl/db_impl.h" +#include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index e6ce1fa8364..126addc80d1 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,9 +11,9 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". 
+#include "db/db_impl/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" -#include "db/db_impl/db_impl.h" #include "env/mock_env.h" #include "file/filename.h" #include "rocksdb/cache.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 6fabf197f2c..5d8f6eb5e63 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_impl/db_impl.h" #include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" diff --git a/db/merge_test.cc b/db/merge_test.cc index 1b62b5c2c57..2965045d9df 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,8 +7,8 @@ #include #include -#include "db/dbformat.h" #include "db/db_impl/db_impl.h" +#include "db/dbformat.h" #include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" @@ -18,9 +18,6 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" -#include "db/dbformat.h" -#include "db/write_batch_internal.h" -#include "db/db_impl/db_impl.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/write_batch.cc b/db/write_batch.cc index 1459e5a3aae..d7a2e792a33 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -42,13 +42,13 @@ #include #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" #include "db/write_batch_internal.h" -#include "db/db_impl/db_impl.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index d4923a88f4c..6553b49614c 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -13,8 +13,8 @@ #include -#include "db/db_impl/db_impl.h" #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/status.h" diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 69e991ed855..593cd64a0fc 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -10,13 +10,13 @@ #include #include +#include "db/db_impl/db_impl.h" +#include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/compaction_filter.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/utilities/utility_db.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/db_impl/db_impl.h" +#include "rocksdb/utilities/utility_db.h" #ifdef _WIN32 // Windows API macro interference From d7d8605f56fd4f881869395aa06f9c5f259b5020 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Fri, 31 May 2019 16:59:00 -0700 Subject: [PATCH 098/572] Fix a clang analyze warning (#5398) Summary: Clang analyzer is reporting a false positive warning thinking `type` is uninitialized. The variable is initialized by `ParseFileName` by reference so assigning a default value to keep clang happy. 
Current failure: ``` file/filename.cc:435:15: warning: The left operand of '==' is a garbage value (type == kInfoLogFile)) { ~~~~ ^ 1 warning generated. ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5398 Differential Revision: D15588421 Pulled By: miasantreble fbshipit-source-id: fb121c270300f3a659e68bc7f6674ff4ddf2df9a --- file/filename.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/file/filename.cc b/file/filename.cc index 6f00d15ebca..77d9569d3a9 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -413,7 +413,7 @@ Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, assert(parent_dir != nullptr); assert(info_log_list != nullptr); uint64_t number = 0; - FileType type; + FileType type = kLogFile; if (!db_log_dir.empty()) { *parent_dir = db_log_dir; From 79edf0a7a8ab75f60692efd54b1e0ed7da7aafca Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 31 May 2019 17:14:03 -0700 Subject: [PATCH 099/572] util: fix log_write_bench (#5335) Summary: log_write_bench doesn't compile due to some recent API changes. This patch fixes the compile by adding the missing params for OptimizeForLogWrite() and WritableFileWriter(). Signed-off-by: Yuan Zhou Pull Request resolved: https://github.com/facebook/rocksdb/pull/5335 Differential Revision: D15588875 Pulled By: miasantreble fbshipit-source-id: 726ff4dc227733e915c3b796df25bd3ab0b431ac --- util/log_write_bench.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index 9efa43f8a3c..ac4cb685b6e 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -32,13 +32,16 @@ DEFINE_bool(enable_sync, false, "sync after each write."); namespace rocksdb { void RunBenchmark() { std::string file_name = test::PerThreadDBPath("log_write_benchmark.log"); + DBOptions options; Env* env = Env::Default(); - EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions()); + EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions(), options); env_options.bytes_per_sync = FLAGS_bytes_per_sync; std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); std::unique_ptr writer; - writer.reset(new WritableFileWriter(std::move(file), env_options)); + writer.reset(new WritableFileWriter(std::move(file), file_name, env_options, + env, nullptr /* stats */, + options.listeners)); std::string record; record.assign(FLAGS_record_size, 'X'); From 000b9ec217663faad1d0196b28c623149e01e024 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Fri, 31 May 2019 17:19:43 -0700 Subject: [PATCH 100/572] Move some logging related files to logging/ (#5387) Summary: Many logging related source files are under util/. It will be more structured if they are together. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5387 Differential Revision: D15579036 Pulled By: siying fbshipit-source-id: 3850134ed50b8c0bb40a0c8ae1f184fa4081303f --- CMakeLists.txt | 10 +++++----- Makefile | 4 ++-- TARGETS | 10 +++++----- db/builder.h | 2 +- db/compaction/compaction_job.cc | 4 ++-- db/compaction/compaction_job.h | 2 +- db/compaction/compaction_job_stats_test.cc | 2 +- db/compaction/compaction_picker.cc | 2 +- db/compaction/compaction_picker_fifo.cc | 2 +- db/compaction/compaction_picker_level.cc | 2 +- db/compaction/compaction_picker_test.cc | 2 +- db/compaction/compaction_picker_universal.cc | 2 +- db/db_impl/db_impl.cc | 6 +++--- db/db_impl/db_impl.h | 3 ++- db/db_impl/db_impl_secondary.cc | 2 +- db/db_iter.cc | 2 +- db/dbformat.h | 2 +- db/dbformat_test.cc | 2 +- db/event_helpers.h | 2 +- db/fault_injection_test.cc | 2 +- db/filename_test.cc | 2 +- db/flush_job.cc | 6 +++--- db/flush_job.h | 2 +- db/listener_test.cc | 2 +- db/memtable_list.cc | 2 +- db/memtable_list.h | 2 +- db/plain_table_db_test.cc | 2 +- db/version_builder_test.cc | 2 +- db/version_edit.cc | 2 +- db/version_set_test.cc | 2 +- db/wal_manager.cc | 2 +- env/env_hdfs.cc | 2 +- env/env_posix.cc | 4 ++-- env/env_test.cc | 2 +- env/io_posix.cc | 2 +- file/delete_scheduler.cc | 2 +- file/filename.cc | 2 +- java/rocksjni/write_batch.cc | 2 +- {util => logging}/auto_roll_logger.cc | 5 +++-- {util => logging}/auto_roll_logger.h | 0 {util => logging}/auto_roll_logger_test.cc | 4 ++-- {util => logging}/event_logger.cc | 4 ++-- {util => logging}/event_logger.h | 2 +- {util => logging}/event_logger_test.cc | 2 +- {util => logging}/log_buffer.cc | 2 +- {util => logging}/log_buffer.h | 0 {util => logging}/logging.h | 2 +- {env => logging}/posix_logger.h | 0 memory/arena.cc | 2 +- options/db_options.cc | 2 +- port/port_posix.cc | 4 ++-- port/util_logger.h | 2 +- port/win/port_win.cc | 2 +- src.mk | 10 +++++----- table/block_based/block.cc | 2 +- table/block_based/partitioned_filter_block_test.cc | 2 +- table/block_fetcher.cc | 2 +- table/format.cc | 2 +- test_util/transaction_test_util.cc | 2 +- tools/db_stress.cc | 2 +- util/bloom_test.cc | 2 +- util/comparator.cc | 8 ++++---- util/dynamic_bloom_test.cc | 2 +- utilities/backupable/backupable_db.cc | 2 +- utilities/blob_db/blob_db_impl.cc | 2 +- utilities/blob_db/blob_db_impl_filesnapshot.cc | 2 +- utilities/blob_db/blob_file.cc | 2 +- utilities/merge_operators/uint64add.cc | 2 +- utilities/persistent_cache/block_cache_tier.cc | 2 +- utilities/persistent_cache/block_cache_tier_file.cc | 2 +- utilities/transactions/optimistic_transaction_test.cc | 2 +- 71 files changed, 96 insertions(+), 94 deletions(-) rename {util => logging}/auto_roll_logger.cc (99%) rename {util => logging}/auto_roll_logger.h (100%) rename {util => logging}/auto_roll_logger_test.cc (99%) rename {util => logging}/event_logger.cc (96%) rename {util => logging}/event_logger.h (99%) rename {util => logging}/event_logger_test.cc (97%) rename {util => logging}/log_buffer.cc (98%) rename {util => logging}/log_buffer.h (100%) rename {util => logging}/logging.h (98%) rename {env => logging}/posix_logger.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c2fa7119c2..1b5f03a0f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -549,6 +549,9 @@ set(SOURCES file/file_util.cc file/filename.cc file/sst_file_manager_impl.cc + logging/auto_roll_logger.cc + logging/event_logger.cc + logging/log_buffer.cc memory/arena.cc memory/concurrent_arena.cc 
memory/jemalloc_nodump_allocator.cc @@ -620,7 +623,6 @@ set(SOURCES tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - util/auto_roll_logger.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc @@ -629,11 +631,9 @@ set(SOURCES util/concurrent_task_limiter_impl.cc util/crc32c.cc util/dynamic_bloom.cc - util/event_logger.cc util/file_reader_writer.cc util/filter_policy.cc util/hash.cc - util/log_buffer.cc util/murmurhash.cc util/random.cc util/rate_limiter.cc @@ -939,6 +939,8 @@ if(WITH_TESTS) env/env_test.cc env/mock_env_test.cc file/delete_scheduler_test.cc + logging/auto_roll_logger_test.cc + logging/event_logger_test.cc memory/arena_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc @@ -963,13 +965,11 @@ if(WITH_TESTS) tools/reduce_levels_test.cc tools/sst_dump_test.cc tools/trace_analyzer_test.cc - util/auto_roll_logger_test.cc util/autovector_test.cc util/bloom_test.cc util/coding_test.cc util/crc32c_test.cc util/dynamic_bloom_test.cc - util/event_logger_test.cc util/file_reader_writer_test.cc util/filelock_test.cc util/hash_test.cc diff --git a/Makefile b/Makefile index 5181154a212..080e0713355 100644 --- a/Makefile +++ b/Makefile @@ -1498,7 +1498,7 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) -event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +event_logger_test: logging/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1519,7 +1519,7 @@ manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index edddc7b99be..da4f4d9a61d 100644 --- a/TARGETS +++ b/TARGETS @@ -148,6 +148,9 @@ cpp_library( "file/file_util.cc", "file/filename.cc", "file/sst_file_manager_impl.cc", + "logging/auto_roll_logger.cc", + "logging/event_logger.cc", + "logging/log_buffer.cc", "memory/arena.cc", "memory/concurrent_arena.cc", "memory/jemalloc_nodump_allocator.cc", @@ -218,7 +221,6 @@ cpp_library( "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", - "util/auto_roll_logger.cc", "util/bloom.cc", "util/build_version.cc", "util/coding.cc", @@ -228,11 +230,9 @@ cpp_library( "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", "util/dynamic_bloom.cc", - "util/event_logger.cc", "util/file_reader_writer.cc", "util/filter_policy.cc", "util/hash.cc", - "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", @@ -359,7 +359,7 @@ ROCKS_TESTS = [ ], [ "auto_roll_logger_test", - "util/auto_roll_logger_test.cc", + "logging/auto_roll_logger_test.cc", "serial", ], [ @@ -699,7 +699,7 @@ ROCKS_TESTS = [ ], [ "event_logger_test", - "util/event_logger_test.cc", + "logging/event_logger_test.cc", "serial", ], [ diff --git a/db/builder.h b/db/builder.h index 34a4bff1a25..4fa56f50e34 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include #include "db/range_tombstone_fragmenter.h" #include "db/table_properties_collector.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include 
"rocksdb/comparator.h" #include "rocksdb/env.h" @@ -20,7 +21,6 @@ #include "rocksdb/table_properties.h" #include "rocksdb/types.h" #include "table/scoped_arena_iterator.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index b782c6ca7ad..5761345d8a2 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -40,6 +40,8 @@ #include "db/version_set.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -56,8 +58,6 @@ #include "test_util/sync_point.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 1387fffb1c1..84d38c163eb 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -29,6 +29,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include "options/db_options.h" #include "port/port.h" @@ -40,7 +41,6 @@ #include "rocksdb/transaction_log.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 91310e9f112..5fb805df5f0 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -28,6 +28,7 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -57,7 +58,6 @@ #include "test_util/testutil.h" #include "util/compression.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 4276ea9cb41..a03f7b46fd1 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -21,9 +21,9 @@ #include #include "db/column_family.h" #include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" #include "test_util/sync_point.h" -#include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index ffb5a9f6495..1fc6ed113d2 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -18,7 +18,7 @@ #include #include #include "db/column_family.h" -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index aeb368ea20a..e9653da8e55 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -16,8 +16,8 @@ #include #include "db/compaction/compaction_picker_level.h" +#include "logging/log_buffer.h" #include 
"test_util/sync_point.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index bab93227a4f..58a0a12f03e 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -12,9 +12,9 @@ #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" +#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 465245715fd..e8aca00be81 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -21,9 +21,9 @@ #include #include "db/column_family.h" #include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" #include "test_util/sync_point.h" -#include "util/log_buffer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 196e38f14fa..9675e727dde 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -56,6 +56,9 @@ #include "file/file_util.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/iostats_context_imp.h" @@ -86,15 +89,12 @@ #include "table/two_level_iterator.h" #include "test_util/sync_point.h" #include "tools/sst_dump_tool_imp.h" -#include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index c241a36dbc3..5461ef300aa 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -40,6 +40,8 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "db/memtable_list.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -52,7 +54,6 @@ #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/hash.h" #include "util/repeatable_thread.h" #include "util/stop_watch.h" diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index a976a5750dd..34364d124a8 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -12,8 +12,8 @@ #include "db/db_iter.h" #include "db/merge_context.h" +#include "logging/auto_roll_logger.h" #include "monitoring/perf_context_imp.h" -#include "util/auto_roll_logger.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index d953d365e0f..bcfed2bb021 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -17,6 +17,7 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "file/filename.h" +#include "logging/logging.h" #include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" @@ -25,7 +26,6 @@ #include 
"rocksdb/options.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" #include "util/trace_replay.h" diff --git a/db/dbformat.h b/db/dbformat.h index 437119fb775..dbf6ea6f3c9 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -14,6 +14,7 @@ #include #include "db/lookup_key.h" #include "db/merge_context.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -23,7 +24,6 @@ #include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" -#include "util/logging.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index f4665b06ca3..9ec1bc34348 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -8,8 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" +#include "logging/logging.h" #include "test_util/testharness.h" -#include "util/logging.h" namespace rocksdb { diff --git a/db/event_helpers.h b/db/event_helpers.h index ea35b4b5b19..88c72cd4e13 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -10,9 +10,9 @@ #include "db/column_family.h" #include "db/version_edit.h" +#include "logging/event_logger.h" #include "rocksdb/listener.h" #include "rocksdb/table_properties.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 126addc80d1..1d18569f2f4 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -16,6 +16,7 @@ #include "db/version_set.h" #include "env/mock_env.h" #include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -25,7 +26,6 @@ #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/db/filename_test.cc b/db/filename_test.cc index 377d128fae0..bc52e0eae64 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -10,9 +10,9 @@ #include "file/filename.h" #include "db/dbformat.h" +#include "logging/logging.h" #include "port/port.h" #include "test_util/testharness.h" -#include "util/logging.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index d4ae79ff29a..2b2696c10ba 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -31,6 +31,9 @@ #include "db/version_set.h" #include "file/file_util.h" #include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -47,9 +50,6 @@ #include "table/two_level_iterator.h" #include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" diff --git a/db/flush_job.h b/db/flush_job.h index c4081945623..fdb0917bdba 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -28,6 +28,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -37,7 +38,6 @@ #include "rocksdb/transaction_log.h" #include 
"table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" diff --git a/db/listener_test.cc b/db/listener_test.cc index 5d8f6eb5e63..9fbd5d0d3ff 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -9,6 +9,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -28,7 +29,6 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ca5283139a5..045bfc9a2d3 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -17,6 +17,7 @@ #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "logging/log_buffer.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -24,7 +25,6 @@ #include "table/merging_iterator.h" #include "test_util/sync_point.h" #include "util/coding.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/db/memtable_list.h b/db/memtable_list.h index a5f0c123292..a72077ff3d5 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -17,13 +17,13 @@ #include "db/memtable.h" #include "db/range_del_aggregator.h" #include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "util/autovector.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index d2d0426e652..68df71768e2 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -16,6 +16,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -32,7 +33,6 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" #include "utilities/merge_operators.h" diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 63067857420..3a144190cf1 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -6,9 +6,9 @@ #include #include "db/version_edit.h" #include "db/version_set.h" +#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/version_edit.cc b/db/version_edit.cc index 668ff60f103..ecadf6e3980 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -10,10 +10,10 @@ #include "db/version_edit.h" #include "db/version_set.h" +#include "logging/event_logger.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 9b4072dc777..77890d82638 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -9,10 +9,10 @@ #include "db/version_set.h" #include "db/log_writer.h" +#include 
"logging/logging.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 2fe5305f8d6..71c2ffe4b22 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -24,6 +24,7 @@ #include "db/write_batch_internal.h" #include "file/file_util.h" #include "file/filename.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -32,7 +33,6 @@ #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 9d0354cced8..5bdf03ae3e1 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -17,8 +17,8 @@ #include #include #include +#include "logging/logging.h" #include "rocksdb/status.h" -#include "util/logging.h" #include "util/string_util.h" #define HDFS_EXISTS 0 diff --git a/env/env_posix.cc b/env/env_posix.cc index bf1a9e0e5c4..7eb5b7c1451 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -43,7 +43,8 @@ #include #include "env/io_posix.h" -#include "env/posix_logger.h" +#include "logging/logging.h" +#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" @@ -52,7 +53,6 @@ #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" -#include "util/logging.h" #include "util/random.h" #include "util/string_util.h" #include "util/thread_local.h" diff --git a/env/env_test.cc b/env/env_test.cc index 615eca8b400..e8cb9b24534 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -38,13 +38,13 @@ #endif #include "env/env_chroot.h" +#include "logging/log_buffer.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" -#include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" diff --git a/env/io_posix.cc b/env/io_posix.cc index 313cbd8eee6..8b42a636295 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -27,7 +27,7 @@ #include #include #endif -#include "env/posix_logger.h" +#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 22f28f5375f..b66956ca08c 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -11,10 +11,10 @@ #include #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" -#include "util/logging.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/file/filename.cc b/file/filename.cc index 77d9569d3a9..c9f22e585b7 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -16,10 +16,10 @@ #include #include #include +#include "logging/logging.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index f1b77446c02..c6d0b9072ae 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -11,6 +11,7 @@ #include "db/write_batch_internal.h" #include 
"include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" +#include "logging/logging.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" @@ -20,7 +21,6 @@ #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" #include "table/scoped_arena_iterator.h" -#include "util/logging.h" /* * Class: org_rocksdb_WriteBatch diff --git a/util/auto_roll_logger.cc b/logging/auto_roll_logger.cc similarity index 99% rename from util/auto_roll_logger.cc rename to logging/auto_roll_logger.cc index 9e8d6750319..ec240f5a334 100644 --- a/util/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -3,10 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" + #include #include "file/filename.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/auto_roll_logger.h b/logging/auto_roll_logger.h similarity index 100% rename from util/auto_roll_logger.h rename to logging/auto_roll_logger.h diff --git a/util/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc similarity index 99% rename from util/auto_roll_logger_test.cc rename to logging/auto_roll_logger_test.cc index ff47719d490..cce98d374ef 100644 --- a/util/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" #include #include #include @@ -17,11 +17,11 @@ #include #include #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" -#include "util/logging.h" namespace rocksdb { namespace { diff --git a/util/event_logger.cc b/logging/event_logger.cc similarity index 96% rename from util/event_logger.cc rename to logging/event_logger.cc index b488984f350..aceccdf93c0 100644 --- a/util/event_logger.cc +++ b/logging/event_logger.cc @@ -7,14 +7,14 @@ #define __STDC_FORMAT_MACROS #endif -#include "util/event_logger.h" +#include "logging/event_logger.h" #include #include #include #include -#include "util/logging.h" +#include "logging/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/util/event_logger.h b/logging/event_logger.h similarity index 99% rename from util/event_logger.h rename to logging/event_logger.h index d88a6a4fe68..c3a7c30c601 100644 --- a/util/event_logger.h +++ b/logging/event_logger.h @@ -10,8 +10,8 @@ #include #include +#include "logging/log_buffer.h" #include "rocksdb/env.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/util/event_logger_test.cc b/logging/event_logger_test.cc similarity index 97% rename from util/event_logger_test.cc rename to logging/event_logger_test.cc index 1ee0c4d9787..cc635d42fbf 100644 --- a/util/event_logger_test.cc +++ b/logging/event_logger_test.cc @@ -5,8 +5,8 @@ #include +#include "logging/event_logger.h" #include "test_util/testharness.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/util/log_buffer.cc b/logging/log_buffer.cc similarity index 98% rename from util/log_buffer.cc rename to logging/log_buffer.cc index d09e0cb002f..74db11c66e3 100644 --- a/util/log_buffer.cc +++ b/logging/log_buffer.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root 
directory). -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "port/sys_time.h" #include "port/port.h" diff --git a/util/log_buffer.h b/logging/log_buffer.h similarity index 100% rename from util/log_buffer.h rename to logging/log_buffer.h diff --git a/util/logging.h b/logging/logging.h similarity index 98% rename from util/logging.h rename to logging/logging.h index a4ef31bd6b5..cad90a309f1 100644 --- a/util/logging.h +++ b/logging/logging.h @@ -19,7 +19,7 @@ inline const char* RocksLogShorterFileName(const char* file) { - // 15 is the length of "util/logging.h". + // 15 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. return file + (sizeof(__FILE__) > 15 ? sizeof(__FILE__) - 15 : 0); } diff --git a/env/posix_logger.h b/logging/posix_logger.h similarity index 100% rename from env/posix_logger.h rename to logging/posix_logger.h diff --git a/memory/arena.cc b/memory/arena.cc index b774225535e..3f113e776a4 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -19,10 +19,10 @@ #include #endif #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" -#include "util/logging.h" namespace rocksdb { diff --git a/options/db_options.cc b/options/db_options.cc index e180238f433..72e348b3227 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -11,12 +11,12 @@ #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" #include "rocksdb/wal_filter.h" -#include "util/logging.h" namespace rocksdb { diff --git a/port/port_posix.cc b/port/port_posix.cc index 80081e480e0..f19d18ff0e6 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -18,11 +18,11 @@ #include #include #include -#include #include +#include #include #include -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { diff --git a/port/util_logger.h b/port/util_logger.h index ba424705b27..d2d62a9879c 100644 --- a/port/util_logger.h +++ b/port/util_logger.h @@ -14,7 +14,7 @@ // of what the new port_.h file must provide. 
#if defined(ROCKSDB_PLATFORM_POSIX) -#include "env/posix_logger.h" +#include "logging/posix_logger.h" #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif diff --git a/port/win/port_win.cc b/port/win/port_win.cc index 03ba6ef4281..31e65e78cde 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -33,7 +33,7 @@ #include #endif -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { diff --git a/src.mk b/src.mk index 5021acb96ac..38835f8c6d2 100644 --- a/src.mk +++ b/src.mk @@ -72,6 +72,9 @@ LIB_SOURCES = \ file/file_util.cc \ file/filename.cc \ file/sst_file_manager_impl.cc \ + logging/auto_roll_logger.cc \ + logging/event_logger.cc \ + logging/log_buffer.cc \ memory/arena.cc \ memory/concurrent_arena.cc \ memory/jemalloc_nodump_allocator.cc \ @@ -139,7 +142,6 @@ LIB_SOURCES = \ test_util/sync_point_impl.cc \ test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ - util/auto_roll_logger.cc \ util/bloom.cc \ util/build_version.cc \ util/coding.cc \ @@ -149,11 +151,9 @@ LIB_SOURCES = \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ util/dynamic_bloom.cc \ - util/event_logger.cc \ util/file_reader_writer.cc \ util/filter_policy.cc \ util/hash.cc \ - util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ @@ -340,6 +340,8 @@ MAIN_SOURCES = \ env/env_basic_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ + logging/auto_roll_logger_test.cc \ + logging/event_logger_test.cc \ memory/arena_test.cc \ memtable/inlineskiplist_test.cc \ memtable/memtablerep_bench.cc \ @@ -369,13 +371,11 @@ MAIN_SOURCES = \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ tools/trace_analyzer_test.cc \ - util/auto_roll_logger_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ util/crc32c_test.cc \ util/dynamic_bloom_test.cc \ - util/event_logger_test.cc \ util/filelock_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ diff --git a/table/block_based/block.cc b/table/block_based/block.cc index dfc4aa3c679..6c7e46d5969 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -15,6 +15,7 @@ #include #include +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" @@ -23,7 +24,6 @@ #include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" -#include "util/logging.h" namespace rocksdb { diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 9a1a4d526f1..70e5bbd3bbd 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -11,11 +11,11 @@ #include "table/full_filter_bits_builder.h" #include "index_builder.h" +#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" -#include "util/logging.h" namespace rocksdb { diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 263abbfcf80..72b567fc23d 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -12,6 +12,7 @@ #include #include +#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -24,7 +25,6 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" #include 
"util/xxhash.h" diff --git a/table/format.cc b/table/format.cc index 3f95fd4d44b..a4441fe5646 100644 --- a/table/format.cc +++ b/table/format.cc @@ -13,6 +13,7 @@ #include #include "block_fetcher.h" +#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -24,7 +25,6 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index 14d39065182..3a7d9e97f50 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -24,7 +24,7 @@ #include "db/dbformat.h" #include "db/snapshot_impl.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/random.h" #include "util/string_util.h" diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 0c828deb165..dc8f8152376 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -47,6 +47,7 @@ int main() { #include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" +#include "logging/logging.h" #include "monitoring/histogram.h" #include "options/options_helper.h" #include "port/port.h" @@ -66,7 +67,6 @@ int main() { #include "util/compression.h" #include "util/crc32c.h" #include "util/gflags_compat.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 7a13728308c..5e61f31ba60 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -17,13 +17,13 @@ int main() { #include +#include "logging/logging.h" #include "memory/arena.h" #include "rocksdb/filter_policy.h" #include "table/full_filter_bits_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" -#include "util/logging.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/util/comparator.cc b/util/comparator.cc index b42c23725fc..eab17ebccf3 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -7,13 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "rocksdb/comparator.h" +#include #include #include -#include -#include "rocksdb/comparator.h" -#include "rocksdb/slice.h" +#include "logging/logging.h" #include "port/port.h" -#include "util/logging.h" +#include "rocksdb/slice.h" namespace rocksdb { diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 036e0128008..7ca8bb891aa 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -24,12 +24,12 @@ int main() { #include #include "dynamic_bloom.h" +#include "logging/logging.h" #include "memory/arena.h" #include "port/port.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" -#include "util/logging.h" #include "util/stop_watch.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index dcd88ffdb8c..7a2e1940316 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -11,6 +11,7 @@ #include "rocksdb/utilities/backupable_db.h" #include "file/filename.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" @@ -19,7 +20,6 @@ #include "util/coding.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/string_util.h" #include "utilities/checkpoint/checkpoint_impl.h" diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 7f447a04ad0..25583fa981a 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -16,6 +16,7 @@ #include "file/file_util.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "monitoring/instrumented_mutex.h" #include "monitoring/statistics.h" #include "rocksdb/convenience.h" @@ -31,7 +32,6 @@ #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc index 16b9ff826e6..e74396a33d6 100644 --- a/utilities/blob_db/blob_db_impl_filesnapshot.cc +++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -8,7 +8,7 @@ #include "utilities/blob_db/blob_db_impl.h" #include "file/filename.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/mutexlock.h" // BlobDBImpl methods to get snapshot of files, e.g. for replication. 
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 4475772d8d1..03cff7834b9 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -21,7 +21,7 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "file/filename.h" -#include "util/logging.h" +#include "logging/logging.h" #include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index b998e1b8e4e..e71ecfd9a5b 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -5,11 +5,11 @@ #include +#include "logging/logging.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "util/coding.h" -#include "util/logging.h" #include "utilities/merge_operators.h" using namespace rocksdb; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 5baf64772cc..2169f906955 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -10,9 +10,9 @@ #include #include +#include "logging/logging.h" #include "port/port.h" #include "test_util/sync_point.h" -#include "util/logging.h" #include "util/stop_watch.h" #include "utilities/persistent_cache/block_cache_tier_file.h" diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index ce6335fb586..0fb17b369e3 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -13,9 +13,9 @@ #include #include +#include "logging/logging.h" #include "port/port.h" #include "util/crc32c.h" -#include "util/logging.h" namespace rocksdb { diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 4f075d0d9fc..5e1af2fb1f5 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -9,6 +9,7 @@ #include #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" @@ -16,7 +17,6 @@ #include "test_util/testharness.h" #include "test_util/transaction_test_util.h" #include "util/crc32c.h" -#include "util/logging.h" #include "util/random.h" using std::string; From 349db9049732ad1f6c7466483b4e79c8817730dd Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 3 Jun 2019 12:31:45 -0700 Subject: [PATCH 101/572] Make GetEntryFromCache a member function. (#5394) Summary: This commit makes GetEntryFromCache a member function of BlockBasedTable and turns all of its callers into member functions as well.
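As a rough sketch of the shape of this refactoring (hypothetical `MiniTable`/`MiniRep` types, not RocksDB's), per-table state such as the level moves from an explicit parameter into the `rep_` that the member function already owns:

```cpp
#include <cstdio>
#include <string>

// Hypothetical miniature of the refactor; names do not match RocksDB's.
struct MiniRep {
  int level;
};

class MiniTable {
 public:
  explicit MiniTable(int level) : rep_{level} {}

  // Before: a free function to which every caller had to pass table state:
  //   Handle* GetEntryFromCache(Cache*, const Slice& key, int level, ...);
  // After: a const member function that reads that state through rep_.
  void GetEntryFromCache(const std::string& key) const {
    std::printf("lookup %s at level %d\n", key.c_str(), rep_.level);
  }

 private:
  MiniRep rep_;
};

int main() {
  MiniTable t(/*level=*/2);
  t.GetEntryFromCache("k1");  // callers no longer thread rep/level through
  return 0;
}
```

As the hunks below show, call sites stop passing `rep`-derived arguments explicitly, which is the whole payoff of the change.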
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5394 Differential Revision: D15579222 Pulled By: HaoyuHuang fbshipit-source-id: 07509c42ee9022dcded54950012bd3bd562aa1ae --- table/block_based/block_based_table_reader.cc | 513 +++++++++--------- table/block_based/block_based_table_reader.h | 112 ++-- 2 files changed, 311 insertions(+), 314 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index b7fba779f47..2fdaf2afd2a 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -17,8 +17,6 @@ #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "table/block_fetcher.h" -#include "table/meta_blocks.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -29,6 +27,7 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" + #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" @@ -36,9 +35,11 @@ #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" +#include "table/meta_blocks.h" #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" @@ -128,51 +129,6 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } -Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - int level, Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, - uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, - Statistics* statistics, - GetContext* get_context) { - auto cache_handle = block_cache->Lookup(key, statistics); - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache hit - get_context->get_context_stats_.num_cache_hit++; - // total bytes read from cache - get_context->get_context_stats_.num_cache_bytes_read += - block_cache->GetUsage(cache_handle); - // block-type specific cache hit - (*block_cache_hit_stats)++; - } else { - // overall cache hit - RecordTick(statistics, BLOCK_CACHE_HIT); - // total bytes read from cache - RecordTick(statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); - RecordTick(statistics, block_cache_hit_ticker); - } - } else { - PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache miss - get_context->get_context_stats_.num_cache_miss++; - // block-type specific cache miss - (*block_cache_miss_stats)++; - } else { - RecordTick(statistics, BLOCK_CACHE_MISS); - RecordTick(statistics, block_cache_miss_ticker); - } - } - - return cache_handle; -} - // For hash based index, return true if prefix_extractor and // prefix_extractor_block mismatch, false otherwise. 
This flag will be used // as total_order_seek via NewIndexIterator @@ -275,8 +231,8 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( assert(rep != nullptr); constexpr bool is_index = true; - const Status s = BlockBasedTable::RetrieveBlock( - prefetch_buffer, rep, read_options, rep->footer.index_handle(), + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), UncompressionDict::GetEmptyDict(), index_block, is_index, get_context); return s; @@ -446,10 +402,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks - s = BlockBasedTable::MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), rep, ro, handle, - UncompressionDict::GetEmptyDict(), &block, is_index, - nullptr /* get_context */); + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + &block, is_index, nullptr /* get_context */); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -707,6 +662,49 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { std::unique_ptr prefix_index_; }; +Cache::Handle* BlockBasedTable::GetEntryFromCache( + Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, Statistics* statistics, + GetContext* get_context) const { + auto cache_handle = block_cache->Lookup(key, statistics); + if (cache_handle != nullptr) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + if (get_context != nullptr) { + // overall cache hit + get_context->get_context_stats_.num_cache_hit++; + // total bytes read from cache + get_context->get_context_stats_.num_cache_bytes_read += + block_cache->GetUsage(cache_handle); + // block-type specific cache hit + (*block_cache_hit_stats)++; + } else { + // overall cache hit + RecordTick(statistics, BLOCK_CACHE_HIT); + // total bytes read from cache + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(cache_handle)); + RecordTick(statistics, block_cache_hit_ticker); + } + } else { + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + if (get_context != nullptr) { + // overall cache miss + get_context->get_context_stats_.num_cache_miss++; + // block-type specific cache miss + (*block_cache_miss_stats)++; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + RecordTick(statistics, block_cache_miss_ticker); + } + } + + return cache_handle; +} + // Helper function to setup the cache key's prefix for the Table. 
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { assert(kMaxCacheKeyPrefixSize >= 10); @@ -938,24 +936,24 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Read metaindex std::unique_ptr meta; std::unique_ptr meta_iter; - s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); + s = new_table->ReadMetaBlock(prefetch_buffer.get(), &meta, &meta_iter); if (!s.ok()) { return s; } - s = ReadPropertiesBlock(rep, prefetch_buffer.get(), meta_iter.get(), - largest_seqno); + s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(), + largest_seqno); if (!s.ok()) { return s; } - s = ReadRangeDelBlock(rep, prefetch_buffer.get(), meta_iter.get(), - internal_comparator); + s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(), + internal_comparator); if (!s.ok()) { return s; } - s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), - new_table.get(), prefetch_all, table_options, - level); + s = new_table->PrefetchIndexAndFilterBlocks( + prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all, + table_options, level); if (s.ok()) { // Update tail prefetch stats @@ -1043,7 +1041,7 @@ Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, } Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, TableProperties** table_properties) { assert(table_properties != nullptr); // If this is an external SST file ingested with write_global_seqno set to @@ -1054,8 +1052,8 @@ Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( // original value, i.e. 0, and verify the checksum again. BlockHandle props_block_handle; CacheAllocationPtr tmp_buf; - Status s = ReadProperties(handle_value, rep->file.get(), prefetch_buffer, - rep->footer, rep->ioptions, table_properties, + Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, + rep_->footer, rep_->ioptions, table_properties, false /* verify_checksum */, &props_block_handle, &tmp_buf, false /* compression_type_missing */, nullptr /* memory_allocator */); @@ -1071,21 +1069,21 @@ Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); } uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = rocksdb::VerifyChecksum(rep->footer.checksum(), tmp_buf.get(), + s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(), block_size + 1, value); } return s; } Status BlockBasedTable::ReadPropertiesBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const SequenceNumber largest_seqno) { bool found_properties_block = true; Status s; s = SeekToPropertiesBlock(meta_iter, &found_properties_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.info_log, "Error when seeking to properties block from file: %s", s.ToString().c_str()); } else if (found_properties_block) { @@ -1093,15 +1091,15 @@ Status BlockBasedTable::ReadPropertiesBlock( TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties( - meta_iter->value(), rep->file.get(), prefetch_buffer, rep->footer, - rep->ioptions, &table_properties, true /* verify_checksum */, + meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, true /* 
verify_checksum */, nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, false /* compression_type_missing */, nullptr /* memory_allocator */); } if (s.IsCorruption()) { - s = TryReadPropertiesWithGlobalSeqno( - rep, prefetch_buffer, meta_iter->value(), &table_properties); + s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), + &table_properties); } std::unique_ptr props_guard; if (table_properties != nullptr) { @@ -1109,53 +1107,55 @@ Status BlockBasedTable::ReadPropertiesBlock( } if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.info_log, "Encountered error while reading data from properties " "block %s", s.ToString().c_str()); } else { assert(table_properties != nullptr); - rep->table_properties.reset(props_guard.release()); - rep->blocks_maybe_compressed = rep->table_properties->compression_name != - CompressionTypeToString(kNoCompression); - rep->blocks_definitely_zstd_compressed = - (rep->table_properties->compression_name == + rep_->table_properties.reset(props_guard.release()); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == CompressionTypeToString(kZSTD) || - rep->table_properties->compression_name == + rep_->table_properties->compression_name == CompressionTypeToString(kZSTDNotFinalCompression)); } } else { - ROCKS_LOG_ERROR(rep->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "Cannot find Properties block from file."); } #ifndef ROCKSDB_LITE - if (rep->table_properties) { - ParseSliceTransform(rep->table_properties->prefix_extractor_name, - &(rep->table_prefix_extractor)); + if (rep_->table_properties) { + ParseSliceTransform(rep_->table_properties->prefix_extractor_name, + &(rep_->table_prefix_extractor)); } #endif // ROCKSDB_LITE // Read the table properties, if provided. 
- if (rep->table_properties) { - rep->whole_key_filtering &= - IsFeatureSupported(*(rep->table_properties), + if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep->ioptions.info_log); - rep->prefix_filtering &= IsFeatureSupported( - *(rep->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); - - s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, - &(rep->global_seqno)); + rep_->ioptions.info_log); + rep_->prefix_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.info_log); + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); } } return s; } Status BlockBasedTable::ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator) { Status s; bool found_range_del_block; @@ -1163,13 +1163,13 @@ Status BlockBasedTable::ReadRangeDelBlock( s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep_->ioptions.info_log, "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); } else if (found_range_del_block && !range_del_handle.IsNull()) { ReadOptions read_options; std::unique_ptr iter(NewDataBlockIterator( - rep, read_options, range_del_handle, nullptr /* input_iter */, + read_options, range_del_handle, nullptr /* input_iter */, false /* is_index */, true /* key_includes_seq */, true /* index_key_is_full */, nullptr /* get_context */, Status(), prefetch_buffer)); @@ -1177,11 +1177,11 @@ Status BlockBasedTable::ReadRangeDelBlock( s = iter->status(); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep_->ioptions.info_log, "Encountered error while reading data from range del block %s", s.ToString().c_str()); } else { - rep->fragmented_range_dels = + rep_->fragmented_range_dels = std::make_shared(std::move(iter), internal_comparator); } @@ -1190,25 +1190,25 @@ Status BlockBasedTable::ReadRangeDelBlock( } Status BlockBasedTable::ReadCompressionDictBlock( - const Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) { + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* compression_dict_block) const { assert(compression_dict_block != nullptr); Status s; - if (!rep->compression_dict_handle.IsNull()) { + if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr compression_dict_cont{new BlockContents()}; PersistentCacheOptions cache_options; ReadOptions read_options; read_options.verify_checksums = true; BlockFetcher compression_block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, read_options, - rep->compression_dict_handle, compression_dict_cont.get(), - rep->ioptions, false /* decompress */, false /*maybe_compressed*/, + rep_->file.get(), prefetch_buffer, rep_->footer, read_options, + rep_->compression_dict_handle, compression_dict_cont.get(), + rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), cache_options); s = compression_block_fetcher.ReadBlockContents(); if (!s.ok()) { 
ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep_->ioptions.info_log, "Encountered error while reading data from compression dictionary " "block %s", s.ToString().c_str()); @@ -1220,13 +1220,13 @@ Status BlockBasedTable::ReadCompressionDictBlock( } Status BlockBasedTable::PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, const BlockBasedTableOptions& table_options, const int level) { Status s; // Find filter handle and filter type - if (rep->filter_policy) { + if (rep_->filter_policy) { for (auto filter_type : {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, Rep::FilterType::kBlockFilter}) { @@ -1245,10 +1245,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( assert(0); } std::string filter_block_key = prefix; - filter_block_key.append(rep->filter_policy->Name()); - if (FindMetaBlock(meta_iter, filter_block_key, &rep->filter_handle) + filter_block_key.append(rep_->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) .ok()) { - rep->filter_type = filter_type; + rep_->filter_type = filter_type; break; } } @@ -1258,7 +1258,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // Find compression dictionary handle bool found_compression_dict; s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep->compression_dict_handle); + &rep_->compression_dict_handle); } BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); @@ -1272,13 +1272,14 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // prefetch the first level of filter const bool prefetch_filter = - prefetch_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); + prefetch_all || + (table_options.pin_top_level_index_and_filter && + rep_->filter_type == Rep::FilterType::kPartitionedFilter); // Partition fitlers cannot be enabled without partition indexes assert(!prefetch_filter || prefetch_index); // pin both index and filters, down to all partitions const bool pin_all = - rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; // pin the first level of index const bool pin_index = pin_all || (table_options.pin_top_level_index_and_filter && @@ -1286,7 +1287,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // pin the first level of filter const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); + rep_->filter_type == Rep::FilterType::kPartitionedFilter); IndexReader* index_reader = nullptr; if (s.ok()) { @@ -1294,12 +1295,12 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( prefetch_index, pin_index, &index_reader); if (s.ok()) { assert(index_reader != nullptr); - rep->index_reader.reset(index_reader); + rep_->index_reader.reset(index_reader); // The partitions of partitioned index are always stored in cache. 
They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks if (prefetch_all) { - rep->index_reader->CacheDependencies(pin_all); + rep_->index_reader->CacheDependencies(pin_all); } } else { delete index_reader; @@ -1318,43 +1319,43 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = - new_table->GetFilter(rep->table_prefix_extractor.get()); + new_table->GetFilter(rep_->table_prefix_extractor.get()); if (filter_entry.GetValue() != nullptr && prefetch_all) { filter_entry.GetValue()->CacheDependencies( - pin_all, rep->table_prefix_extractor.get()); + pin_all, rep_->table_prefix_extractor.get()); } // if pin_filter is true then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the // cache while this reader is alive if (pin_filter) { - rep->filter_entry = std::move(filter_entry); + rep_->filter_entry = std::move(filter_entry); } } } else { std::unique_ptr compression_dict_block; if (s.ok()) { // Set filter block - if (rep->filter_policy) { + if (rep_->filter_policy) { const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter(prefetch_buffer, rep->filter_handle, - !is_a_filter_partition, - rep->table_prefix_extractor.get()); - rep->filter.reset(filter); + auto filter = new_table->ReadFilter( + prefetch_buffer, rep_->filter_handle, !is_a_filter_partition, + rep_->table_prefix_extractor.get()); + rep_->filter.reset(filter); // Refer to the comment above about paritioned indexes always being // cached if (filter && prefetch_all) { - filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); + filter->CacheDependencies(pin_all, + rep_->table_prefix_extractor.get()); } } - s = ReadCompressionDictBlock(rep, prefetch_buffer, - &compression_dict_block); + s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); } - if (s.ok() && !rep->compression_dict_handle.IsNull()) { + if (s.ok() && !rep_->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - rep->uncompression_dict.reset(new UncompressionDict( + rep_->uncompression_dict.reset(new UncompressionDict( compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, rep->ioptions.statistics)); + rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); } } return s; @@ -1399,23 +1400,22 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const { // Load the meta-block from the file. On success, return the loaded meta block // and its iterator. -Status BlockBasedTable::ReadMetaBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, +Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* meta_block, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. 
std::unique_ptr meta; Status s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - rep->footer.metaindex_handle(), &meta, rep->ioptions, + rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->footer.metaindex_handle(), &meta, rep_->ioptions, true /* decompress */, true /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, - GetMemoryAllocator(rep->table_options)); + GetMemoryAllocator(rep_->table_options)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "Encountered error while reading data from properties" " block %s", s.ToString().c_str()); @@ -1431,22 +1431,24 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, + Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, - bool is_index, GetContext* get_context) { + const UncompressionDict& uncompression_dict, bool is_index, + GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; assert(block); assert(block->IsEmpty()); Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = rep->ioptions.statistics; + Statistics* statistics = rep_->ioptions.statistics; // Lookup uncompressed cache first if (block_cache != nullptr) { auto cache_handle = GetEntryFromCache( - block_cache, block_cache_key, rep->level, + block_cache, block_cache_key, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, is_index ? 
BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, get_context @@ -1498,16 +1500,16 @@ Status BlockBasedTable::GetDataBlockFromCache( BlockContents contents; UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressBlockContents(info, compressed_block->data.data(), - compressed_block->data.size(), &contents, - rep->table_options.format_version, rep->ioptions, - GetMemoryAllocator(rep->table_options)); + s = UncompressBlockContents( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); // Insert uncompressed block into block cache if (s.ok()) { std::unique_ptr block_holder( - new Block(std::move(contents), rep->get_global_seqno(is_index), - read_amp_bytes_per_bit, statistics)); // uncompressed block + new Block(std::move(contents), rep_->get_global_seqno(is_index), + read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { @@ -1566,13 +1568,20 @@ Status BlockBasedTable::GetDataBlockFromCache( Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, + CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index, Cache::Priority priority, GetContext* get_context) { - + MemoryAllocator* memory_allocator, bool is_index, + GetContext* get_context) const { + const ImmutableCFOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; + const Cache::Priority priority = + is_index && rep_->table_options + .cache_index_and_filter_blocks_with_high_priority + ? Cache::Priority::HIGH + : Cache::Priority::LOW; assert(cached_block); assert(cached_block->IsEmpty()); assert(raw_block_comp_type == kNoCompression || @@ -1791,8 +1800,7 @@ CachableEntry BlockBasedTable::GetFilter( Statistics* statistics = rep_->ioptions.statistics; Cache::Handle* cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, + block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, get_context ? &get_context->get_context_stats_.num_cache_filter_miss : nullptr, get_context ? &get_context->get_context_stats_.num_cache_filter_hit @@ -1843,25 +1851,24 @@ CachableEntry BlockBasedTable::GetFilter( false /* own_value */}; } -CachableEntry -BlockBasedTable::GetUncompressionDict(const Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) { - if (!rep->table_options.cache_index_and_filter_blocks) { +CachableEntry BlockBasedTable::GetUncompressionDict( + FilePrefetchBuffer* prefetch_buffer, bool no_io, + GetContext* get_context) const { + if (!rep_->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. 
- return {rep->uncompression_dict.get(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; + return {rep_->uncompression_dict.get(), nullptr /* cache */, + nullptr /* cache_handle */, false /* own_value */}; } - if (rep->compression_dict_handle.IsNull()) { + if (rep_->compression_dict_handle.IsNull()) { return CachableEntry(); } char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto cache_key = - GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - rep->compression_dict_handle, cache_key_buf); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key_buf); auto cache_handle = GetEntryFromCache( - rep->table_options.block_cache.get(), cache_key, rep->level, + rep_->table_options.block_cache.get(), cache_key, BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, get_context ? &get_context->get_context_stats_.num_cache_compression_dict_miss @@ -1869,29 +1876,29 @@ BlockBasedTable::GetUncompressionDict(const Rep* rep, get_context ? &get_context->get_context_stats_.num_cache_compression_dict_hit : nullptr, - rep->ioptions.statistics, get_context); + rep_->ioptions.statistics, get_context); UncompressionDict* dict = nullptr; if (cache_handle != nullptr) { dict = reinterpret_cast( - rep->table_options.block_cache->Value(cache_handle)); + rep_->table_options.block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. } else { std::unique_ptr compression_dict_block; Status s = - ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); + ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); size_t usage = 0; if (s.ok()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy dict = new UncompressionDict(compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, - rep->ioptions.statistics); + rep_->blocks_definitely_zstd_compressed, + rep_->ioptions.statistics); usage = dict->ApproximateMemoryUsage(); - s = rep->table_options.block_cache->Insert( + s = rep_->table_options.block_cache->Insert( cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, &cache_handle, - rep->table_options.cache_index_and_filter_blocks_with_high_priority + rep_->table_options.cache_index_and_filter_blocks_with_high_priority ? Cache::Priority::HIGH : Cache::Priority::LOW); } @@ -1904,23 +1911,23 @@ BlockBasedTable::GetUncompressionDict(const Rep* rep, get_context->get_context_stats_ .num_cache_compression_dict_bytes_insert += usage; } else { - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); - RecordTick(rep->ioptions.statistics, + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); } } else { // There should be no way to get here if block cache insertion succeeded. // Though it is still possible something failed earlier. - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); delete dict; dict = nullptr; assert(cache_handle == nullptr); } } - return {dict, cache_handle ? 
rep->table_options.block_cache.get() : nullptr, - cache_handle, false /* own_value */}; + return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr, + cache_handle, false /* own_value */}; } // disable_prefix_seek should be set to true when prefix_extractor found in SST @@ -1943,10 +1950,10 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( // If input_iter is not null, update this iter and return it template TBlockIter* BlockBasedTable::NewDataBlockIterator( - const Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, Status s, - FilePrefetchBuffer* prefetch_buffer) { + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + bool is_index, bool key_includes_seq, bool index_key_is_full, + GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; @@ -1957,15 +1964,15 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool no_io = (ro.read_tier == kBlockCacheTier); auto uncompression_dict_storage = - GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); + GetUncompressionDict(prefetch_buffer, no_io, get_context); const UncompressionDict& uncompression_dict = uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() : *uncompression_dict_storage.GetValue(); CachableEntry block; - s = RetrieveBlock(prefetch_buffer, rep, ro, handle, uncompression_dict, - &block, is_index, get_context); + s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, + is_index, get_context); if (!s.ok()) { assert(block.IsEmpty()); @@ -1984,16 +1991,16 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // Otherwise, the block is pinned iff the source is immortal. 
const bool block_contents_pinned = block.IsCached() || - (!block.GetValue()->own_bytes() && rep->immortal_table); + (!block.GetValue()->own_bytes() && rep_->immortal_table); iter = block.GetValue()->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + &rep_->internal_comparator, rep_->internal_comparator.user_comparator(), + iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq, index_key_is_full, block_contents_pinned); if (!block.IsCached()) { - if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { // insert a dummy record to block cache to track the memory usage - Cache* const block_cache = rep->table_options.block_cache.get(); + Cache* const block_cache = rep_->table_options.block_cache.get(); Cache::Handle* cache_handle = nullptr; // There are two other types of cache keys: 1) SST cache key added in // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in @@ -2002,11 +2009,11 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // differentiate from `write_buffer_manager` const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep->cache_key_prefix padded by 0s + // Prefix: use rep_->cache_key_prefix padded by 0s memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep->cache_key_prefix_size != 0); - assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, next_cache_key_id_++); assert(end - cache_key <= @@ -2028,17 +2035,18 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } Status BlockBasedTable::MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, GetContext* get_context) { + CachableEntry* block_entry, bool is_index, + GetContext* get_context) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep->table_options.block_cache.get(); + Cache* block_cache = rep_->table_options.block_cache.get(); // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = - rep->immortal_table ? nullptr - : rep->table_options.block_cache_compressed.get(); + rep_->immortal_table ? 
nullptr + : rep_->table_options.block_cache_compressed.get(); // First, try to get the block from the cache // @@ -2051,58 +2059,50 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, cache_key); } if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(rep->compressed_cache_key_prefix, - rep->compressed_cache_key_prefix_size, handle, + ckey = GetCacheKey(rep_->compressed_cache_key_prefix, + rep_->compressed_cache_key_prefix_size, handle, compressed_cache_key); } - s = GetDataBlockFromCache( - key, ckey, block_cache, block_cache_compressed, rep, ro, block_entry, - uncompression_dict, - !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, is_index, - get_context); + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, is_index, + get_context); // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep->ioptions.statistics; + Statistics* statistics = rep_->ioptions.statistics; bool do_decompress = - block_cache_compressed == nullptr && rep->blocks_maybe_compressed; + block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; { - StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &raw_block_contents, rep->ioptions, - do_decompress /* do uncompress */, rep->blocks_maybe_compressed, - uncompression_dict, rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options), - GetMemoryAllocatorForCompressedBlock(rep->table_options)); + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &raw_block_contents, rep_->ioptions, + do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, + uncompression_dict, rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); } if (s.ok()) { - SequenceNumber seq_no = rep->get_global_seqno(is_index); + SequenceNumber seq_no = rep_->get_global_seqno(is_index); // If filling cache is allowed and a cache is configured, try to put the // block to the cache. - s = PutDataBlockToCache( - key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, - block_entry, &raw_block_contents, raw_block_comp_type, - rep->table_options.format_version, uncompression_dict, seq_no, - !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, - GetMemoryAllocator(rep->table_options), is_index, - is_index && rep->table_options - .cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW, - get_context); + s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, + block_entry, &raw_block_contents, + raw_block_comp_type, uncompression_dict, seq_no, + GetMemoryAllocator(rep_->table_options), + is_index, get_context); } } } @@ -2111,16 +2111,16 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } Status BlockBasedTable::RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, GetContext* get_context) { - assert(rep); + CachableEntry* block_entry, bool is_index, + GetContext* get_context) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; - if (!is_index || rep->table_options.cache_index_and_filter_blocks) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + if (!is_index || rep_->table_options.cache_index_and_filter_blocks) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, uncompression_dict, block_entry, is_index, get_context); @@ -2144,15 +2144,15 @@ Status BlockBasedTable::RetrieveBlock( std::unique_ptr block; { - StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, + StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, &block, - rep->ioptions, rep->blocks_maybe_compressed, - rep->blocks_maybe_compressed, uncompression_dict, - rep->persistent_cache_options, rep->get_global_seqno(is_index), - !is_index ? rep->table_options.read_amp_bytes_per_bit : 0, - GetMemoryAllocator(rep->table_options)); + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, rep_->blocks_maybe_compressed, + rep_->blocks_maybe_compressed, uncompression_dict, + rep_->persistent_cache_options, rep_->get_global_seqno(is_index), + !is_index ? 
rep_->table_options.read_amp_bytes_per_bit : 0, + GetMemoryAllocator(rep_->table_options)); } if (!s.ok()) { @@ -2530,8 +2530,8 @@ void BlockBasedTableIterator::InitDataBlock() { } Status s; - BlockBasedTable::NewDataBlockIterator( - rep, read_options_, data_block_handle, &block_iter_, is_index_, + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, is_index_, key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; @@ -2775,7 +2775,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } else { DataBlockIter biter; NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, false, true /* key_includes_seq */, true /* index_key_is_full */, get_context); @@ -2886,7 +2886,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { DataBlockIter biter; NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, false, true /* key_includes_seq */, get_context); if (read_options.read_tier == kBlockCacheTier && @@ -2989,8 +2989,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, // Load the block specified by the block_handle into the block cache DataBlockIter biter; - NewDataBlockIterator(rep_, ReadOptions(), block_handle, - &biter); + NewDataBlockIterator(ReadOptions(), block_handle, &biter); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -3006,7 +3005,7 @@ Status BlockBasedTable::VerifyChecksum() { // Check Meta blocks std::unique_ptr meta; std::unique_ptr meta_iter; - s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); + s = ReadMetaBlock(nullptr /* prefetch buffer */, &meta, &meta_iter); if (s.ok()) { s = VerifyChecksumInMetaBlocks(meta_iter.get()); if (!s.ok()) { @@ -3075,7 +3074,7 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( s = block_fetcher.ReadBlockContents(); if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { TableProperties* table_properties; - s = TryReadPropertiesWithGlobalSeqno(rep_, nullptr /* prefetch_buffer */, + s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, index_iter->value(), &table_properties); delete table_properties; @@ -3170,8 +3169,7 @@ Status BlockBasedTable::CreateIndexReader( std::unique_ptr meta_iter_guard; auto meta_index_iter = preloaded_meta_index_iter; if (meta_index_iter == nullptr) { - auto s = - ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard); + auto s = ReadMetaBlock(prefetch_buffer, &meta_guard, &meta_iter_guard); if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. 
@@ -3251,7 +3249,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); + ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); if (!s.ok()) { @@ -3296,8 +3294,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, "--------------------------------------\n"); std::unique_ptr meta; std::unique_ptr meta_iter; - Status s = - ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter); + Status s = ReadMetaBlock(nullptr /* prefetch_buffer */, &meta, &meta_iter); if (s.ok()) { for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { s = meta_iter->status(); @@ -3387,7 +3384,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, // Output compression dictionary if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, + s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */, &compression_dict_block); if (!s.ok()) { return s; @@ -3543,7 +3540,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); + ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index f6f610ca2ac..e53248fbcba 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -219,12 +219,12 @@ class BlockBasedTable : public TableReader { // input_iter: if it is not null, update this one and return it as Iterator template - static TBlockIter* NewDataBlockIterator( - const Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, bool key_includes_seq = true, bool index_key_is_full = true, GetContext* get_context = nullptr, Status s = Status(), - FilePrefetchBuffer* prefetch_buffer = nullptr); + FilePrefetchBuffer* prefetch_buffer = nullptr) const; class PartitionedIndexIteratorState; @@ -238,6 +238,14 @@ class BlockBasedTable : public TableReader { friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, + Statistics* statistics, + GetContext* get_context) const; + // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and // then (3) file. If found, inserts into the cache(s) that were searched @@ -247,22 +255,20 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. 
- static Status MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, const Rep* rep, - const ReadOptions& ro, const BlockHandle& handle, - const UncompressionDict& uncompression_dict, + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, bool is_index = false, - GetContext* get_context = nullptr); + GetContext* get_context = nullptr) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). - static Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, - const Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, - const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, - GetContext* get_context); + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, + GetContext* get_context) const; // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -276,9 +282,9 @@ class BlockBasedTable : public TableReader { const bool is_a_filter_partition, bool no_io, GetContext* get_context, const SliceTransform* prefix_extractor = nullptr) const; - static CachableEntry GetUncompressionDict( - const Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context); + CachableEntry GetUncompressionDict( + FilePrefetchBuffer* prefetch_buffer, bool no_io, + GetContext* get_context) const; // Get the iterator from the index reader. // If input_iter is not set, return new Iterator @@ -301,13 +307,12 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. - static Status GetDataBlockFromCache( + Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, const Rep* rep, + Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, - size_t read_amp_bytes_per_bit, bool is_index = false, - GetContext* get_context = nullptr); + const UncompressionDict& uncompression_dict, bool is_index = false, + GetContext* get_context = nullptr) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -319,16 +324,16 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
- static Status PutDataBlockToCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, - GetContext* get_context = nullptr); + Status PutDataBlockToCache(const Slice& block_cache_key, + const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry* cached_block, + BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, + SequenceNumber seq_no, + MemoryAllocator* memory_allocator, bool is_index, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -336,8 +341,6 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - void ReadMeta(const Footer& footer); - // Figure the index type, update it in rep_, and also return it. BlockBasedTableOptions::IndexType UpdateIndexType(); @@ -365,28 +368,25 @@ class BlockBasedTable : public TableReader { TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer); - static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* meta_block, - std::unique_ptr* iter); - static Status TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties); - static Status ReadPropertiesBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const SequenceNumber largest_seqno); - static Status ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator); - static Status ReadCompressionDictBlock( - const Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block); - static Status PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, BlockBasedTable* new_table, - bool prefetch_all, const BlockBasedTableOptions& table_options, - const int level); + Status ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* meta_block, + std::unique_ptr* iter); + Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer, + const Slice& handle_value, + TableProperties** table_properties); + Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator); + Status ReadCompressionDictBlock( + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* compression_dict_block) const; + Status PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level); Status 
VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter);
   Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter);

From 5851cb7fdbb85a19dc0d3d9cc0a61adeb9a3ae02 Mon Sep 17 00:00:00 2001
From: Siying Dong
Date: Mon, 3 Jun 2019 13:21:02 -0700
Subject: [PATCH 102/572] Move util/trace_replay.* to trace_replay/ (#5376)

Summary:
util/ is meant for lower-level libraries. trace_replay is highly integrated
with DB and sometimes calls DB. Move it out to a separate directory.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5376

Differential Revision: D15550938

Pulled By: siying

fbshipit-source-id: f46dce5ceffdc05a73f26379c7bb1b79ebe6c207
---
 CMakeLists.txt                              | 2 +-
 TARGETS                                     | 2 +-
 db/db_impl/db_impl.h                        | 2 +-
 db/db_iter.cc                               | 2 +-
 src.mk                                      | 2 +-
 tools/trace_analyzer_test.cc                | 2 +-
 tools/trace_analyzer_tool.cc                | 2 +-
 tools/trace_analyzer_tool.h                 | 2 +-
 {util => trace_replay}/trace_replay.cc      | 2 +-
 {util => trace_replay}/trace_replay.h       | 0
 utilities/trace/file_trace_reader_writer.cc | 2 +-
 11 files changed, 10 insertions(+), 10 deletions(-)
 rename {util => trace_replay}/trace_replay.cc (99%)
 rename {util => trace_replay}/trace_replay.h (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b5f03a0f3b..7cb4cc7a863 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -623,6 +623,7 @@ set(SOURCES
   tools/ldb_tool.cc
   tools/sst_dump_tool.cc
   tools/trace_analyzer_tool.cc
+  trace_replay/trace_replay.cc
   util/bloom.cc
   util/coding.cc
   util/compaction_job_stats_impl.cc
@@ -642,7 +643,6 @@ set(SOURCES
   util/string_util.cc
   util/thread_local.cc
   util/threadpool_imp.cc
-  util/trace_replay.cc
   util/xxhash.cc
   utilities/backupable/backupable_db.cc
   utilities/blob_db/blob_compaction_filter.cc
diff --git a/TARGETS b/TARGETS
index da4f4d9a61d..a635ed5ac7d 100644
--- a/TARGETS
+++ b/TARGETS
@@ -221,6 +221,7 @@ cpp_library(
         "tools/ldb_cmd.cc",
         "tools/ldb_tool.cc",
         "tools/sst_dump_tool.cc",
+        "trace_replay/trace_replay.cc",
         "util/bloom.cc",
         "util/build_version.cc",
         "util/coding.cc",
@@ -241,7 +242,6 @@ cpp_library(
         "util/string_util.cc",
         "util/thread_local.cc",
         "util/threadpool_imp.cc",
-        "util/trace_replay.cc",
         "util/xxhash.cc",
         "utilities/backupable/backupable_db.cc",
         "utilities/blob_db/blob_compaction_filter.cc",
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 5461ef300aa..f73e8665fb6 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -53,12 +53,12 @@
 #include "rocksdb/transaction_log.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/scoped_arena_iterator.h"
+#include "trace_replay/trace_replay.h"
 #include "util/autovector.h"
 #include "util/hash.h"
 #include "util/repeatable_thread.h"
 #include "util/stop_watch.h"
 #include "util/thread_local.h"
-#include "util/trace_replay.h"

 namespace rocksdb {
diff --git a/db/db_iter.cc b/db/db_iter.cc
index bcfed2bb021..29a1a9eac1a 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -26,9 +26,9 @@
 #include "rocksdb/options.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
 #include "util/mutexlock.h"
 #include "util/string_util.h"
-#include "util/trace_replay.h"
 #include "util/user_comparator_wrapper.h"

 namespace rocksdb {
diff --git a/src.mk b/src.mk
index 38835f8c6d2..c172d0b2c2d 100644
--- a/src.mk
+++ b/src.mk
@@ -142,6 +142,7 @@ LIB_SOURCES = \
   test_util/sync_point_impl.cc \
   test_util/transaction_test_util.cc \
   tools/dump/db_dump_tool.cc \
+  trace_replay/trace_replay.cc \
   util/bloom.cc \
   util/build_version.cc \
   util/coding.cc \
@@ -162,7 +163,6 @@
util/string_util.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ - util/trace_replay.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index 7c242f60f26..dcc954384fd 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -30,7 +30,7 @@ int main() { #include "test_util/testharness.h" #include "test_util/testutil.h" #include "tools/trace_analyzer_tool.h" -#include "util/trace_replay.h" +#include "trace_replay/trace_replay.h" namespace rocksdb { diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 93528c00608..6ab606f6a6a 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -44,13 +44,13 @@ #include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "tools/trace_analyzer_tool.h" +#include "trace_replay/trace_replay.h" #include "util/coding.h" #include "util/compression.h" #include "util/file_reader_writer.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "util/trace_replay.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::RegisterFlagValidator; diff --git a/tools/trace_analyzer_tool.h b/tools/trace_analyzer_tool.h index be96f5005da..4c3b973b79c 100644 --- a/tools/trace_analyzer_tool.h +++ b/tools/trace_analyzer_tool.h @@ -16,7 +16,7 @@ #include "rocksdb/env.h" #include "rocksdb/trace_reader_writer.h" #include "rocksdb/write_batch.h" -#include "util/trace_replay.h" +#include "trace_replay/trace_replay.h" namespace rocksdb { diff --git a/util/trace_replay.cc b/trace_replay/trace_replay.cc similarity index 99% rename from util/trace_replay.cc rename to trace_replay/trace_replay.cc index 9e0e8c48cde..f9448069b80 100644 --- a/util/trace_replay.cc +++ b/trace_replay/trace_replay.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/trace_replay.h" +#include "trace_replay/trace_replay.h" #include #include diff --git a/util/trace_replay.h b/trace_replay/trace_replay.h similarity index 100% rename from util/trace_replay.h rename to trace_replay/trace_replay.h diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index 4a81516a8b7..d8e36c31276 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ b/utilities/trace/file_trace_reader_writer.cc @@ -5,9 +5,9 @@ #include "utilities/trace/file_trace_reader_writer.h" +#include "trace_replay/trace_replay.h" #include "util/coding.h" #include "util/file_reader_writer.h" -#include "util/trace_replay.h" namespace rocksdb { From ae05a83e19ff53ed0cb83e248ba19bc9f3b07a07 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 3 Jun 2019 19:47:02 -0700 Subject: [PATCH 103/572] Call ValidateOptions from SetOptions (#5368) Summary: Currently we validate options in DB::Open. However the validation step is missing when options are dynamically updated in ::SetOptions. The patch fixes that. 
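A minimal sketch of the newly covered path (the DB path and option values
below are hypothetical; the TTL/max_open_files constraint is the one
ValidateOptions enforces for every live column family):

  #include <cassert>

  #include "rocksdb/db.h"

  int main() {
    rocksdb::Options options;
    options.create_if_missing = true;
    options.max_open_files = -1;  // TTL requires files to be kept open
    options.ttl = 60 * 60 * 24;   // per-CF TTL of one day
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/validate_demo", &db);
    assert(s.ok());
    // Before this patch, the dynamic update below was applied even though
    // TTL is only supported with max_open_files == -1; with validation wired
    // into SetDBOptions, the inconsistent combination is now rejected.
    s = db->SetDBOptions({{"max_open_files", "100"}});
    assert(s.IsNotSupported());
    delete db;
    return 0;
  }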
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5368 Differential Revision: D15540101 Pulled By: maysamyabandeh fbshipit-source-id: d27bbffd8f0252d1b50bcf59e0a70a278ed937f4 --- db/column_family.cc | 49 ++++++++++++++++++- db/column_family.h | 4 ++ db/db_impl/db_impl.cc | 29 ++++++++--- db/db_impl/db_impl.h | 7 +++ db/db_impl/db_impl_open.cc | 46 +++-------------- db/db_options_test.cc | 4 +- db/db_test.cc | 3 ++ options/options_test.cc | 7 +-- test_util/testutil.cc | 15 ++++-- test_util/testutil.h | 2 +- utilities/options/options_util_test.cc | 4 +- .../transactions/write_prepared_txn_db.cc | 2 +- 12 files changed, 114 insertions(+), 58 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index ce22a00aac3..531cbeca681 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1148,13 +1148,60 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +Status ColumnFamilyData::ValidateOptions( + const DBOptions& db_options, const ColumnFamilyOptions& cf_options) { + Status s; + s = CheckCompressionSupported(cf_options); + if (s.ok() && db_options.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } + if (s.ok()) { + s = CheckCFPathsSupported(db_options, cf_options); + } + if (!s.ok()) { + return s; + } + + if (cf_options.ttl > 0) { + if (db_options.max_open_files != -1) { + return Status::NotSupported( + "TTL is only supported when files are always " + "kept open (set max_open_files = -1). "); + } + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "TTL is only supported in Block-Based Table format. "); + } + } + + if (cf_options.periodic_compaction_seconds > 0) { + if (db_options.max_open_files != -1) { + return Status::NotSupported( + "Periodic Compaction is only supported when files are always " + "kept open (set max_open_files = -1). "); + } + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "Periodic Compaction is only supported in " + "Block-Based Table format. 
"); + } + } + return s; +} + #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const std::unordered_map& options_map) { + const DBOptions& db_options, + const std::unordered_map& options_map) { MutableCFOptions new_mutable_cf_options; Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, ioptions_.info_log, &new_mutable_cf_options); + if (s.ok()) { + ColumnFamilyOptions cf_options = + BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); + s = ValidateOptions(db_options, cf_options); + } if (s.ok()) { mutable_cf_options_ = new_mutable_cf_options; mutable_cf_options_.RefreshDerivedOptions(ioptions_); diff --git a/db/column_family.h b/db/column_family.h index 655cb159261..8646b4fc197 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -338,9 +338,13 @@ class ColumnFamilyData { bool is_delete_range_supported() { return is_delete_range_supported_; } + // Validate CF options against DB options + static Status ValidateOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); #ifndef ROCKSDB_LITE // REQUIRES: DB mutex held Status SetOptions( + const DBOptions& db_options, const std::unordered_map& options_map); #endif // ROCKSDB_LITE diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 9675e727dde..ba76abc2875 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -848,8 +848,9 @@ Status DBImpl::SetOptions( Status persist_options_status; SuperVersionContext sv_context(/* create_superversion */ true); { + auto db_options = GetDBOptions(); InstrumentedMutexLock l(&mutex_); - s = cfd->SetOptions(options_map); + s = cfd->SetOptions(db_options, options_map); if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. 
@@ -912,6 +913,25 @@ Status DBImpl::SetDBOptions( InstrumentedMutexLock l(&mutex_); s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, &new_options); + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } if (s.ok()) { if (new_options.max_background_compactions > mutable_db_options_.max_background_compactions) { @@ -956,15 +976,12 @@ Status DBImpl::SetDBOptions( : new_options.max_open_files - 10); wal_changed = mutable_db_options_.wal_bytes_per_sync != new_options.wal_bytes_per_sync; - if (new_options.bytes_per_sync == 0) { - new_options.bytes_per_sync = 1024 * 1024; - } mutable_db_options_ = new_options; - env_options_for_compaction_ = EnvOptions( - BuildDBOptions(immutable_db_options_, mutable_db_options_)); + env_options_for_compaction_ = EnvOptions(new_db_options); env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite( env_options_for_compaction_, immutable_db_options_); versions_->ChangeEnvOptions(mutable_db_options_); + //TODO(xiez): clarify why apply optimize for read to write options env_options_for_compaction_ = env_->OptimizeForCompactionTableRead( env_options_for_compaction_, immutable_db_options_); env_options_for_compaction_.compaction_readahead_size = diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index f73e8665fb6..ab8cb11d9c9 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1501,6 +1501,13 @@ class DBImpl : public DB { Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, size_t preallocate_block_size, log::Writer** new_log); + // Validate self-consistency of DB options + static Status ValidateOptions(const DBOptions& db_options); + // Validate self-consistency of DB options and its consistency with cf options + static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 5019221b5ca..2fc12746d7d 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -145,7 +145,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } namespace { - Status SanitizeOptionsByTable( const DBOptions& db_opts, const std::vector& column_families) { @@ -158,52 +157,23 @@ Status SanitizeOptionsByTable( } return Status::OK(); } +} // namespace -static Status ValidateOptions( +Status DBImpl::ValidateOptions( const DBOptions& db_options, const std::vector& column_families) { Status s; - for (auto& cfd : column_families) { - s = CheckCompressionSupported(cfd.options); - if (s.ok() && db_options.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cfd.options); - } - if (s.ok()) { - s = CheckCFPathsSupported(db_options, cfd.options); - } + s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); if (!s.ok()) { return s; } - - if (cfd.options.ttl > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "TTL is only supported when files are always " - "kept open (set max_open_files = -1). 
"); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "TTL is only supported in Block-Based Table format. "); - } - } - - if (cfd.options.periodic_compaction_seconds > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "Periodic Compaction is only supported when files are always " - "kept open (set max_open_files = -1). "); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "Periodic Compaction is only supported in " - "Block-Based Table format. "); - } - } } + s = ValidateOptions(db_options); + return s; +} +Status DBImpl::ValidateOptions(const DBOptions& db_options) { if (db_options.db_paths.size() > 4) { return Status::NotSupported( "More than four DB paths are not supported yet. "); @@ -241,7 +211,7 @@ static Status ValidateOptions( return Status::OK(); } -} // namespace + Status DBImpl::NewDB() { VersionEdit new_db; new_db.SetLogNumber(0); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index a9c8d218235..bf33153284e 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -66,10 +66,10 @@ class DBOptionsTest : public DBTestBase { std::unordered_map GetRandomizedMutableCFOptionsMap( Random* rnd) { - Options options; + Options options = CurrentOptions(); options.env = env_; ImmutableDBOptions db_options(options); - test::RandomInitCFOptions(&options, rnd); + test::RandomInitCFOptions(&options, options, rnd); auto sanitized_options = SanitizeOptions(db_options, options); auto opt_map = GetMutableCFOptionsMap(sanitized_options); delete options.compaction_filter; diff --git a/db/db_test.cc b/db/db_test.cc index 4c4bd382ca8..27cf790ee57 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4884,11 +4884,14 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); + // Appveyor fails with: Compression type Snappy is not linked with the binary +#ifndef OS_WIN ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kSnappyCompression, mutable_cf_options.compression); +#endif // Test paranoid_file_checks already done in db_block_cache_test ASSERT_OK( dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); diff --git a/options/options_test.cc b/options/options_test.cc index 429b607e4f9..1aa3bace7dd 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -842,7 +842,7 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { Random rnd(301); test::RandomInitDBOptions(&base_db_opts, &rnd); - test::RandomInitCFOptions(&base_cf_opts, &rnd); + test::RandomInitCFOptions(&base_cf_opts, base_db_opts, &rnd); Options base_opts(base_db_opts, base_cf_opts); DBOptions new_db_opts(base_opts); @@ -854,11 +854,12 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { } TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { + Options options; ColumnFamilyOptions base_opt, new_opt; Random rnd(302); // Phase 1: randomly assign base_opt // custom type options - test::RandomInitCFOptions(&base_opt, &rnd); + test::RandomInitCFOptions(&base_opt, options, &rnd); // Phase 2: obtain a string from base_opt std::string base_options_file_content; @@ -1521,7 +1522,7 @@ TEST_F(OptionsParserTest, DumpAndParse) { for (int c = 0; c < num_cf; ++c) { 
ColumnFamilyOptions cf_opt; Random cf_rnd(0xFB + c); - test::RandomInitCFOptions(&cf_opt, &cf_rnd); + test::RandomInitCFOptions(&cf_opt, base_db_opt, &cf_rnd); if (c < 4) { cf_opt.prefix_extractor.reset(test::RandomSliceTransform(&rnd, c)); } diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 18e1a45bb36..4e37cde40d1 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -162,7 +162,11 @@ std::string RandomName(Random* rnd, const size_t len) { } CompressionType RandomCompressionType(Random* rnd) { - return static_cast(rnd->Uniform(6)); + auto ret = static_cast(rnd->Uniform(6)); + while (!CompressionTypeSupported(ret)) { + ret = static_cast((static_cast(ret) + 1) % 6); + } + return ret; } void RandomCompressionTypeVector(const size_t count, @@ -293,7 +297,8 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->stats_dump_period_sec = rnd->Uniform(100000); } -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, + Random* rnd) { cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4)); // boolean options @@ -345,8 +350,10 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { // uint64_t options static const uint64_t uint_max = static_cast(UINT_MAX); - cf_opt->ttl = uint_max + rnd->Uniform(10000); - cf_opt->periodic_compaction_seconds = uint_max + rnd->Uniform(10000); + cf_opt->ttl = + db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0; + cf_opt->periodic_compaction_seconds = + db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0; cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); cf_opt->max_compaction_bytes = diff --git a/test_util/testutil.h b/test_util/testutil.h index 7890ce5f511..bc0b2b07d5f 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -657,7 +657,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); // Randomly initialize the given ColumnFamilyOptions // Note that the caller is responsible for releasing non-null // cf_opt->compaction_filter. -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd); +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions&, Random* rnd); // A dummy merge operator which can change its name class ChanglingMergeOperator : public MergeOperator { diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 5b8015152ff..8c71dbf5dc3 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -58,7 +58,7 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { cf_names.push_back(i == 0 ? 
kDefaultColumnFamilyName : test::RandomName(&rnd_, 10)); cf_opts.emplace_back(); - test::RandomInitCFOptions(&cf_opts.back(), &rnd_); + test::RandomInitCFOptions(&cf_opts.back(), db_opt, &rnd_); } const std::string kFileName = "OPTIONS-123456"; @@ -82,7 +82,7 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { cf_opts[i].table_factory.get(), loaded_cf_descs[i].options.table_factory.get())); } - test::RandomInitCFOptions(&cf_opts[i], &rnd_); + test::RandomInitCFOptions(&cf_opts[i], db_opt, &rnd_); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( cf_opts[i], loaded_cf_descs[i].options)); } diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index bf94d83d82b..e2a8fbbf20f 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -210,7 +210,7 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, WriteBatch empty_batch; write_options.disableWAL = true; write_options.sync = false; - const size_t ONE_BATCH = 1; // Just to inc the seq + const size_t ONE_BATCH = 1; // Just to inc the seq s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr, no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); From 5d6e8df1cf81213bed4c8fb27bf00bb09dc57e65 Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 3 Jun 2019 22:37:40 -0700 Subject: [PATCH 104/572] Ignore shutdown error during compaction (#5400) Summary: The PR #5275 separated the column dropped and shutdown status codes. However, there were a couple of places in compaction where this change ended up treating a ShutdownInProgress() error as a real error and setting bg_error. This caused a MyRocks unit test to fail due to WAL writes during shutdown returning this error. Fix it by ignoring the shutdown status during compaction.
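For illustration, the intended classification can be summarized in a minimal sketch (a hypothetical helper, not the literal code of this patch; the real checks are inlined at the two call sites in the diff below):

#include "rocksdb/status.h"

// Sketch only: returns true iff a compaction status should be swallowed
// rather than recorded as a background error. A dropped column family and
// a shutdown in progress are both expected conditions during teardown.
bool IsBenignCompactionStatus(const rocksdb::Status& status) {
  return status.ok() || status.IsColumnFamilyDropped() ||
         status.IsShutdownInProgress();
}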
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5400 Differential Revision: D15611680 Pulled By: anand1976 fbshipit-source-id: c602e97840e3ae24eb420d61e0ce95d3e6258632 --- db/db_compaction_test.cc | 30 ++++++++++++++++++++++++++ db/db_impl/db_impl.h | 1 + db/db_impl/db_impl_compaction_flush.cc | 6 ++++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 3051e89cd37..6537950fcc7 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -4557,6 +4557,36 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, CompactionDuringShutdown) { + Options opts = CurrentOptions(); + opts.level0_file_num_compaction_trigger = 2; + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + InternalStats* internal_stats_ptr = cfd->internal_stats(); + ASSERT_NE(internal_stats_ptr, nullptr); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + } + Flush(); + } + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + [&](void* /*arg*/) { + dbfull()->shutting_down_.store(true); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->error_handler_.GetBGError()); +} + // FixFileIngestionCompactionDeadlock tests and verifies that compaction and // file ingestion do not cause deadlock in the event of write stall triggered // by number of L0 files reaching level0_stop_writes_trigger. 
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index ab8cb11d9c9..111a91e04f3 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1000,6 +1000,7 @@ class DBImpl : public DB { friend class DBTest_ConcurrentFlushWAL_Test; friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_CompactionDuringShutdown_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 881fa26af37..7be9b62c5d6 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1049,7 +1049,7 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { // Done - } else if (status.IsColumnFamilyDropped()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, @@ -2680,6 +2680,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats, job_context->job_id); mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); compaction_job.Run(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); @@ -2713,7 +2715,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (status.ok() || status.IsCompactionTooLarge()) { // Done - } else if (status.IsColumnFamilyDropped()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", From c8267120d809551d1de99f518c1a7b453fad20c0 Mon Sep 17 00:00:00 2001 From: Mark Rambacher Date: Mon, 3 Jun 2019 22:59:54 -0700 Subject: [PATCH 105/572] Add support for loading dynamic libraries into the RocksDB environment (#5281) Summary: This change adds a Dynamic Library class to the RocksDB Env. Dynamic libraries are populated via the Env::LoadLibrary method. The addition of dynamic library support allows for a few different features to be developed: 1. The compression code can be changed to use dynamic library support. This would allow RocksDB to determine at run-time what compression packages were installed. This change would eliminate the need to make sure the build-time and run-time environments have the same library set. It would also simplify some of the Java build issues (where it attempts to build and include various packages inside the RocksDB jars). 2. Along with other features (to be provided in a subsequent PR), this change would allow code/configurations to be added to RocksDB at run-time. For example, the build system includes code for building an "rados" environment and adding "Cassandra" features. Instead of these extensions being built into the base RocksDB code, these extensions could be loaded at run-time as required/appropriate, either by configuration or explicitly. We intend to push out other changes in support of extending RocksDB at run-time via configurations.
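As a usage sketch (hypothetical library and symbol names, not part of this patch):

#include <functional>
#include <memory>

#include "rocksdb/env.h"

// Sketch only: "mylib" and "my_create_function" are hypothetical names.
void LoadLibraryExample() {
  std::shared_ptr<rocksdb::DynamicLibrary> library;
  // An empty search path falls back to the platform default mechanism
  // (e.g. LD_LIBRARY_PATH); a "lib" prefix and the platform's shared
  // library extension are added automatically when missing.
  rocksdb::Status s =
      rocksdb::Env::Default()->LoadLibrary("mylib", "", &library);
  if (s.ok()) {
    std::function<void*()> create_fn;
    // LoadFunction() resolves the symbol and wraps the raw function
    // pointer in the requested std::function type.
    s = library->LoadFunction("my_create_function", &create_fn);
  }
}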
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5281 Differential Revision: D15447613 Pulled By: riversand963 fbshipit-source-id: 452cd4f54511c0bceee18f6d9d919aae9fd25fef --- .gitignore | 1 + TARGETS | 1 + buckifier/targets_cfg.py | 1 + build_tools/build_detect_platform | 13 ++++ env/env_posix.cc | 99 +++++++++++++++++++++++++++++++ env/env_test.cc | 46 ++++++++++++++ include/rocksdb/env.h | 42 +++++++++++++ 7 files changed, 203 insertions(+) diff --git a/.gitignore b/.gitignore index e88ccfc008c..6364dfdc401 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ rocksdb_undump db_test2 trace_analyzer trace_analyzer_test +.DS_Store java/out java/target diff --git a/TARGETS b/TARGETS index a635ed5ac7d..0cdd3b162f9 100644 --- a/TARGETS +++ b/TARGETS @@ -30,6 +30,7 @@ ROCKSDB_COMPILER_FLAGS = [ "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 730b5ebf9da..79648bb6a6d 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -35,6 +35,7 @@ "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 7f454bcca08..5d42faa30ae 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -602,6 +602,19 @@ EOF fi fi +if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then + $CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null </dev/null + if [ "$?" = 0 ]; then + EXEC_LDFLAGS+="-ldl" + rm -f test_dl.o + fi + fi +fi + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" diff --git a/env/env_posix.cc b/env/env_posix.cc index 7eb5b7c1451..f1a0907c9fe 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -7,8 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors #include +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include +#endif #include #include + #if defined(OS_LINUX) #include #endif @@ -69,6 +73,17 @@ #endif namespace rocksdb { +#if defined(OS_WIN) +static const std::string kSharedLibExt = ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const std::string kSharedLibExt = ".dylib"; +#else +static const std::string kSharedLibExt = ".so"; +#endif +#endif namespace { @@ -115,6 +130,32 @@ int cloexec_flags(int flags, const EnvOptions* options) { return flags; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +class PosixDynamicLibrary : public DynamicLibrary { + public: + PosixDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~PosixDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) override { + char* err = dlerror(); // Clear any old error + *func = (FunctionPtr)dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } else { + err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + class PosixEnv : public Env { public: PosixEnv(); @@ -729,6 +770,64 @@ class PosixEnv : public Env { return result; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + /** + * Loads the named library into the result. + * If the input name is empty, the current executable is loaded + * On *nix systems, a "lib" prefix is added to the name if one is not supplied + * Comparably, the appropriate shared library extension is added to the name + * if not supplied. 
If search_path is not specified, the shared library will + * be loaded using the default path (LD_LIBRARY_PATH) If search_path is + * specified, the shared library will be searched for in the directories + * provided by the search path + */ + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + Status status; + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = nullptr) override; diff --git a/env/env_test.cc b/env/env_test.cc index e8cb9b24534..30d5b528217 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -247,6 +247,52 @@ TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { ASSERT_EQ(expected_data, actual_data); } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +TEST_F(EnvPosixTest, LoadRocksDBLibrary) { + std::shared_ptr library; + std::function function; + Status status = env_->LoadLibrary("no-such-library", "", &library); + ASSERT_NOK(status); + ASSERT_EQ(nullptr, library.get()); + status = env_->LoadLibrary("rocksdb", "", &library); + if (status.ok()) { // If we have can find a rocksdb shared library + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(library->LoadFunction("rocksdb_create_default_env", + &function)); // from C definition + ASSERT_NE(nullptr, function); + ASSERT_NOK(library->LoadFunction("no-such-method", &function)); + ASSERT_EQ(nullptr, function); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } else { + ASSERT_EQ(nullptr, library.get()); + } +} +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) +TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) { + std::shared_ptr library; + std::function function; + ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } + char buff[1024]; + std::string cwd = getcwd(buff, sizeof(buff)); + + status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library); + if (status.ok()) { + 
ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } +} +#endif // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION + TEST_P(EnvPosixTestWithParam, UnSchedule) { std::atomic called(false); env_->SetBackgroundThreads(1, Env::LOW); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 8f6bd607228..a8fe2fb78ea 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -41,6 +41,7 @@ namespace rocksdb { +class DynamicLibrary; class FileLock; class Logger; class RandomAccessFile; @@ -338,6 +339,18 @@ class Env { // REQUIRES: lock has not already been unlocked. virtual Status UnlockFile(FileLock* lock) = 0; + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ":") and looks for the + // library in those directories. If 'search_path' is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + // Priority for scheduling job in thread pool enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; @@ -978,6 +991,29 @@ class FileLock { void operator=(const FileLock&); }; +class DynamicLibrary { + public: + typedef void* (*FunctionPtr)(); + virtual ~DynamicLibrary() {} + + /** Returns the name of the dynamic library */ + virtual const char* Name() const = 0; + + /** + * Loads the symbol for sym_name from the library and updates the input + * function. Returns the status of the load. + */ + template + Status LoadFunction(const std::string& sym_name, std::function* function) { + FunctionPtr ptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast(ptr); + return s; + } + /** Loads and returns the symbol for sym_name from the library */ + virtual Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) = 0; +}; + extern void LogFlush(const std::shared_ptr& info_log); extern void Log(const InfoLogLevel log_level, @@ -1168,6 +1204,12 @@ class EnvWrapper : public Env { Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_->LoadLibrary(lib_name, search_path, result); + } + void Schedule(void (*f)(void* arg), void* a, Priority pri, void* tag = nullptr, void (*u)(void* arg) = nullptr) override { return target_->Schedule(f, a, pri, tag, u); From ebe89ef9d84cf1a05a47b8d03c7509f9f103ad10 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 4 Jun 2019 10:17:24 -0700 Subject: [PATCH 106/572] Fix merging range tombstone covering put during flush/compaction (#5406) Summary: Flush/compaction use `MergeUntil` which has a special code path to handle a merge ending with a non-`Merge` point key. In particular if that key is a `Put` we forgot to check whether it is covered by a range tombstone. If it is covered then we must not include it in the following call to `TimedFullMerge`. Fixes #5392.
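The failing sequence is small enough to state inline; this is condensed from the new PutDeleteRangeMergeFlush test added below (it assumes the same uint64-add merge operator the test installs):

std::string val;
PutFixed64(&val, 1);
ASSERT_OK(db_->Put(WriteOptions(), "key", val));        // (1) Put
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                           "key", "key_"));             // (2) covers "key"
ASSERT_OK(db_->Merge(WriteOptions(), "key", val));      // (3) Merge
ASSERT_OK(db_->Flush(FlushOptions()));                  // (4) Flush
// Correct Get("key") result: 1, the merge operand alone. Before this fix
// the covered Put leaked through MergeUntil and the result was 2.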
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5406 Differential Revision: D15611144 Pulled By: sagar0 fbshipit-source-id: ba6a7863ca2d043f591de78fd0c4f4561f0c500e --- HISTORY.md | 1 + db/db_range_del_test.cc | 24 ++++++++++++++++++++++++ db/merge_helper.cc | 10 +++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index b9b6998c6f5..b3c2ef14ac2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -24,6 +24,7 @@ ### Bug Fixes * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. +* Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 16d682fc083..e58095b2d92 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -491,6 +491,30 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { ASSERT_EQ(expected, actual); } +TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { + // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4) + // Flush. The `CompactionIterator` previously had a bug where we forgot to + // check for covering range tombstones when processing the (1) Put, causing + // it to reappear after the flush. + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + // NumTableFilesAtLevel() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 4a4d2fb714e..b5ae924ffc6 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -201,7 +201,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. const Slice val = iter->value(); - const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr; + const Slice* val_ptr; + if (kTypeValue == ikey.type && + (range_del_agg == nullptr || + !range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal))) { + val_ptr = &val; + } else { + val_ptr = nullptr; + } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, From 227b5d52df103ef8722e537bd3ecd3445082b288 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 4 Jun 2019 10:51:22 -0700 Subject: [PATCH 107/572] Make RocksDB secondary instance respect atomic groups in version edits. (#5411) Summary: With this commit, RocksDB secondary instance respects atomic groups in version edits. 
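At its core the change factors the bookkeeping into an AtomicGroupReadBuffer shared by the primary's recovery path and the secondary's catch-up path: edits that belong to an atomic group are buffered until the advertised group size is reached, and only then applied as a unit. A self-contained toy model of that contract (simplified; the real class is in the diff below):

#include <cstddef>
#include <cstdint>
#include <vector>

// Toy model of AtomicGroupReadBuffer::AddEdit(): group edits are buffered;
// a size mismatch, or a normal edit arriving mid-group, signals corruption.
struct ToyEdit {
  bool in_atomic_group = false;
  uint32_t remaining_entries = 0;  // group members still expected after this
};

class ToyGroupBuffer {
 public:
  bool AddEdit(const ToyEdit& edit) {  // false == corrupted atomic group
    if (edit.in_atomic_group) {
      if (buffer_.empty()) {
        // The first edit of a group fixes the expected group size.
        buffer_.resize(edit.remaining_entries + 1);
      }
      ++read_;
      if (read_ + edit.remaining_entries != buffer_.size()) {
        return false;  // inconsistent group size
      }
      buffer_[read_ - 1] = edit;
      return true;
    }
    // A normal edit is only legal when no group is being accumulated.
    return buffer_.empty();
  }
  bool IsFull() const { return !buffer_.empty() && read_ == buffer_.size(); }
  void Clear() {
    read_ = 0;
    buffer_.clear();
  }

 private:
  std::size_t read_ = 0;
  std::vector<ToyEdit> buffer_;
};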
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5411 Differential Revision: D15617512 Pulled By: HaoyuHuang fbshipit-source-id: 913f4ede391d772dcaf5649e3cd2099fa292d120 --- db/db_impl/db_secondary_test.cc | 2 +- db/version_edit.h | 1 + db/version_set.cc | 403 ++++++++++++++---------- db/version_set.h | 47 ++- db/version_set_test.cc | 542 +++++++++++++++++++++----------- 5 files changed, 647 insertions(+), 348 deletions(-) diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index e8eafd673ed..5b375422f02 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -373,7 +373,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", [&](void* arg) { Status s = *reinterpret_cast(arg); if (s.IsPathNotFound()) { diff --git a/db/version_edit.h b/db/version_edit.h index 471b4e095ab..e1857b37fc4 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -316,6 +316,7 @@ class VersionEdit { friend class ReactiveVersionSet; friend class VersionSet; friend class Version; + friend class AtomicGroupReadBuffer; bool GetLevel(Slice* input, int* level, const char** msg); diff --git a/db/version_set.cc b/db/version_set.cc index 26465a01a4e..a60a4e87cac 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3313,6 +3313,51 @@ struct VersionSet::ManifestWriter { edit_list(e) {} }; +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = std::move(*edit); + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. 
+ if (!replay_buffer().empty()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); + return Status::Corruption("corrupted atomic group"); + } + return Status::OK(); +} + +bool AtomicGroupReadBuffer::IsFull() const { + return read_edits_in_atomic_group_ == replay_buffer_.size(); +} + +bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } + +void AtomicGroupReadBuffer::Clear() { + read_edits_in_atomic_group_ = 0; + replay_buffer_.clear(); +} + VersionSet::VersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, @@ -4071,6 +4116,74 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Env* env, return Status::OK(); } +Status VersionSet::ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map>& + builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + assert(reader != nullptr); + assert(read_buffer != nullptr); + Status s; + Slice record; + std::string scratch; + size_t recovered_edits = 0; + while (reader->ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + s = read_buffer->AddEdit(&edit); + if (!s.ok()) { + break; + } + if (edit.is_in_atomic_group_) { + if (read_buffer->IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer->replay_buffer()) { + s = ApplyOneVersionEditToBuilder( + e, name_to_options, column_families_not_found, builders, + have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, + have_last_sequence, last_sequence, min_log_number_to_keep, + max_column_family); + if (!s.ok()) { + break; + } + recovered_edits++; + } + if (!s.ok()) { + break; + } + read_buffer->Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder( + edit, name_to_options, column_families_not_found, builders, + have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, have_last_sequence, + last_sequence, min_log_number_to_keep, max_column_family); + if (s.ok()) { + recovered_edits++; + } + } + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. 
+ read_buffer->Clear(); + } + TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", + &recovered_edits); + return s; +} + Status VersionSet::Recover( const std::vector& column_families, bool read_only) { @@ -4148,66 +4261,12 @@ Status VersionSet::Recover( true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; - std::vector replay_buffer; - size_t num_entries_decoded = 0; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - if (edit.is_in_atomic_group_) { - if (replay_buffer.empty()) { - replay_buffer.resize(edit.remaining_entries_ + 1); - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:FirstInAtomicGroup", - &edit); - } - ++num_entries_decoded; - if (num_entries_decoded + edit.remaining_entries_ != - static_cast(replay_buffer.size())) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:IncorrectAtomicGroupSize", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - replay_buffer[num_entries_decoded - 1] = std::move(edit); - if (num_entries_decoded == replay_buffer.size()) { - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:LastInAtomicGroup", - &edit); - for (auto& e : replay_buffer) { - s = ApplyOneVersionEditToBuilder( - e, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - if (!s.ok()) { - break; - } - } - replay_buffer.clear(); - num_entries_decoded = 0; - } - TEST_SYNC_POINT("VersionSet::Recover:AtomicGroup"); - } else { - if (!replay_buffer.empty()) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } - if (!s.ok()) { - break; - } - } + AtomicGroupReadBuffer read_buffer; + s = ReadAndRecover( + &reader, &read_buffer, cf_name_to_options, column_families_not_found, + builders, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, &have_last_sequence, + &last_sequence, &min_log_number_to_keep, &max_column_family); } if (s.ok()) { @@ -5218,19 +5277,11 @@ Status ReactiveVersionSet::Recover( assert(reader != nullptr); Slice record; std::string scratch; - while (s.ok() && reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } + s = ReadAndRecover( + reader, &read_buffer_, cf_name_to_options, column_families_not_found, + builders, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, &have_last_sequence, + &last_sequence, &min_log_number_to_keep, &max_column_family); if (s.ok()) { bool enough = have_next_file && have_log_number && have_last_sequence; if (enough) { @@ -5350,7 +5401,7 
@@ Status ReactiveVersionSet::ReadAndApply( uint64_t previous_log_number = 0; uint32_t max_column_family = 0; uint64_t min_log_number_to_keep = 0; - + uint64_t applied_edits = 0; while (s.ok()) { Slice record; std::string scratch; @@ -5362,73 +5413,46 @@ Status ReactiveVersionSet::ReadAndApply( if (!s.ok()) { break; } - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. Ignore it for now. - if (nullptr == cfd) { - continue; - } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - s = ApplyOneVersionEditToBuilder( - edit, &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); + + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); - if (!s.ok() && !s.IsPathNotFound()) { - break; - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } else { // s.ok() == true - auto version = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyOneVersionEditToBuilder( + e, cfds_changed, &have_log_number, &log_number, + &have_prev_log_number, &previous_log_number, &have_next_file, + &next_file, &have_last_sequence, &last_sequence, + &min_log_number_to_keep, &max_column_family); + if (!s.ok()) { + break; + } + applied_edits++; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + // Apply a normal edit immediately. 
+ s = ApplyOneVersionEditToBuilder( + edit, cfds_changed, &have_log_number, &log_number, + &have_prev_log_number, &previous_log_number, &have_next_file, + &next_file, &have_last_sequence, &last_sequence, + &min_log_number_to_keep, &max_column_family); + if (s.ok()) { + applied_edits++; } } - if (have_next_file) { - next_file_number_.store(next_file + 1); - } - if (have_last_sequence) { - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - } - if (have_prev_log_number) { - prev_log_number_ = previous_log_number; - MarkFileNumberUsed(previous_log_number); - } - if (have_log_number) { - MarkFileNumberUsed(log_number); - } - column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. + read_buffer_.Clear(); } // It's possible that: // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. @@ -5457,52 +5481,113 @@ Status ReactiveVersionSet::ReadAndApply( } } } + TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", + &applied_edits); return s; } Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family) { - ColumnFamilyData* cfd = nullptr; - Status status; + VersionEdit& edit, std::unordered_set* cfds_changed, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. It is also possible that the secondary instance opens only a subset + // of column families. Ignore it for now. + if (nullptr == cfd) { + return Status::OK(); + } + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + if (edit.is_column_family_add_) { // TODO (yanqin) for now the secondary ignores column families created // after Open. This also simplifies handling of switching to a new MANIFEST // and processing the snapshot of the system at the beginning of the // MANIFEST. - return Status::OK(); } else if (edit.is_column_family_drop_) { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Drop a CF created by primary after secondary starts? Then ignore - if (cfd == nullptr) { - return Status::OK(); - } // Drop the column family by setting it to be 'dropped' without destroying // the column family handle. + // TODO (haoyu) figure out how to handle column faimly drop for + // secondary instance. 
(Is it possible that the ref count for cfd is 0 but + // the ref count for its versions is higher than 0?) cfd->SetDropped(); if (cfd->Unref()) { delete cfd; cfd = nullptr; } } else { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Operation on a CF created after Open? Then ignore - if (cfd == nullptr) { - return Status::OK(); - } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); builder->Apply(&edit); } - return ExtractInfoFromVersionEdit( + Status s = ExtractInfoFromVersionEdit( cfd, edit, have_log_number, log_number, have_prev_log_number, previous_log_number, have_next_file, next_file, have_last_sequence, last_sequence, min_log_number_to_keep, max_column_family); + if (!s.ok()) { + return s; + } + + if (cfd != nullptr) { + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" + "AfterLoadTableHandlers", + &s); + + if (s.ok()) { + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } else if (s.IsPathNotFound()) { + s = Status::OK(); + } + // Some other error has occurred during LoadTableHandlers. + } + + if (have_next_file) { + next_file_number_.store(*next_file + 1); + } + if (have_last_sequence) { + last_allocated_sequence_ = *last_sequence; + last_published_sequence_ = *last_sequence; + last_sequence_ = *last_sequence; + } + if (have_prev_log_number) { + prev_log_number_ = *previous_log_number; + MarkFileNumberUsed(*previous_log_number); + } + if (have_log_number) { + MarkFileNumberUsed(*log_number); + } + column_family_set_->UpdateMaxColumnFamily(*max_column_family); + MarkMinLogNumberToKeep2PC(*min_log_number_to_keep); + return s; } Status ReactiveVersionSet::MaybeSwitchManifest( diff --git a/db/version_set.h b/db/version_set.h index c43e4091442..dc9e759655e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -752,6 +752,23 @@ struct ObsoleteFileInfo { class BaseReferencedVersionBuilder; +class AtomicGroupReadBuffer { + public: + Status AddEdit(VersionEdit* edit); + void Clear(); + bool IsFull() const; + bool IsEmpty() const; + + uint64_t TEST_read_edits_in_atomic_group() const { + return read_edits_in_atomic_group_; + } + std::vector& replay_buffer() { return replay_buffer_; } + + private: + uint64_t read_edits_in_atomic_group_ = 0; + std::vector replay_buffer_; +}; + // VersionSet is the collection of versions of all the column families of the // database. Each database owns one VersionSet. A VersionSet has access to all // column families via ColumnFamilySet, i.e. set of the column families. 
@@ -1028,6 +1045,18 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + Status ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& + name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map< + uint32_t, std::unique_ptr>& builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, @@ -1135,16 +1164,23 @@ class ReactiveVersionSet : public VersionSet { std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); + uint64_t TEST_read_edits_in_atomic_group() const { + return read_buffer_.TEST_read_edits_in_atomic_group(); + } + std::vector& replay_buffer() { + return read_buffer_.replay_buffer(); + } + protected: using VersionSet::ApplyOneVersionEditToBuilder; // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family); + VersionEdit& edit, std::unordered_set* cfds_changed, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); Status MaybeSwitchManifest( log::Reader::Reporter* reporter, @@ -1153,6 +1189,7 @@ class ReactiveVersionSet : public VersionSet { private: std::unordered_map> active_version_builders_; + AtomicGroupReadBuffer read_buffer_; using VersionSet::LogAndApply; using VersionSet::Recover; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 77890d82638..bf9ef8e39fe 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -607,6 +607,7 @@ class VersionSetTestBase { const static std::string kColumnFamilyName1; const static std::string kColumnFamilyName2; const static std::string kColumnFamilyName3; + int num_initial_edits_; VersionSetTestBase() : env_(Env::Default()), @@ -618,6 +619,9 @@ class VersionSetTestBase { versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_)), + reactive_versions_(std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_)), shutting_down_(false), mock_table_factory_(std::make_shared()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); @@ -653,7 +657,7 @@ class VersionSetTestBase { new_cfs.emplace_back(new_cf); } *last_seqno = last_seq; - + num_initial_edits_ = static_cast(new_cfs.size() + 1); const std::string manifest = DescriptorFileName(dbname_, 1); std::unique_ptr file; Status s = env_->NewWritableFile( @@ -708,6 +712,7 @@ class VersionSetTestBase { WriteController write_controller_; WriteBufferManager write_buffer_manager_; std::shared_ptr versions_; + std::shared_ptr reactive_versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; @@ -758,216 +763,388 @@ TEST_F(VersionSetTest, 
SameColumnFamilyGroupCommit) { EXPECT_EQ(kGroupSize - 1, count); } -TEST_F(VersionSetTest, HandleValidAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +class VersionSetAtomicGroupTest : public VersionSetTestBase, + public testing::Test { + public: + VersionSetAtomicGroupTest() : VersionSetTestBase() {} - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 3; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); + void SetUp() override { + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + SetupTestSyncPoints(); } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); - - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); + void SetupValidAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; + void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.back().DebugString(), - e->DebugString()); // compare based on value - EXPECT_TRUE(first_in_atomic_group); - last_in_atomic_group = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); + void SetupCorruptedAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != ((size_t)atomic_group_size / 2)) { + edits_[i].MarkAtomicGroup(--remaining); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_TRUE(last_in_atomic_group); -} + void SetupIncorrectAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); 
++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != 1) { + edits_[i].MarkAtomicGroup(--remaining); + } else { + edits_[i].MarkAtomicGroup(remaining--); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } -TEST_F(VersionSetTest, HandleIncompleteTrailingAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); + void SetupTestSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.front().DebugString(), + e->DebugString()); // compare based on value + first_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.back().DebugString(), + e->DebugString()); // compare based on value + EXPECT_TRUE(first_in_atomic_group_); + last_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { + num_recovered_edits_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ReadAndApply:AppliedEdits", + [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroup", + [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", + [&](void* arg) { + corrupted_edit_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", + [&](void* arg) { + edit_with_incorrect_group_size_ = + *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->EnableProcessing(); + } - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 4; - const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; - std::vector edits(kNumberOfPersistedVersionEdits); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); + void AddNewEditsToLog(int num_edits) { + for (int i = 0; i < num_edits; i++) { + std::string record; + edits_[i].EncodeTo(&record); + ASSERT_OK(log_writer_->AddRecord(record)); + } } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); + + void TearDown() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + log_writer_.reset(); } - log_writer.reset(); - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + protected: + std::vector column_families_; + SequenceNumber last_seqno_; + std::vector edits_; + bool first_in_atomic_group_ = false; + bool last_in_atomic_group_ = false; + int num_edits_in_atomic_group_ = 0; + int num_recovered_edits_ = 0; + int num_applied_edits_ = 0; + VersionEdit corrupted_edit_; + VersionEdit edit_with_incorrect_group_size_; + std::unique_ptr log_writer_; +}; - 
SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; - size_t num = 0; +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", - [&](void* /* arg */) { last_in_atomic_group = true; }); - SyncPoint::GetInstance()->SetCallBack("VersionSet::Recover:AtomicGroup", - [&](void* /* arg */) { ++num; }); - SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_FALSE(last_in_atomic_group); - EXPECT_EQ(kNumberOfPersistedVersionEdits, num); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); } -TEST_F(VersionSetTest, HandleCorruptedAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); - - // Append multiple version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != (kAtomicGroupSize / 2)) { - edits[i].MarkAtomicGroup(--remaining); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); - - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + // Write the last record. The reactive version set should now apply all + // edits. + std::string last_record; + edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); + EXPECT_OK(log_writer_->AddRecord(last_record)); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + // Reactive version set should be empty now. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + // No edits in an atomic group. + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + // Write a few edits in an atomic group. + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool mixed = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[kAtomicGroupSize / 2].DebugString(), e->DebugString()); - mixed = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(mixed); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); } -TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - // Append multiple 
version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != 1) { - edits[i].MarkAtomicGroup(--remaining); - } else { - edits[i].MarkAtomicGroup(remaining--); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); + SetupCorruptedAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + // Write the corrupted edits. + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - bool incorrect_group_size = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:IncorrectAtomicGroupSize", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[1].DebugString(), e->DebugString()); - incorrect_group_size = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(incorrect_group_size); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + 
&manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); } class VersionSetTestDropOneCF : public VersionSetTestBase, @@ -1088,7 +1265,6 @@ INSTANTIATE_TEST_CASE_P( testing::Values(VersionSetTestBase::kColumnFamilyName1, VersionSetTestBase::kColumnFamilyName2, VersionSetTestBase::kColumnFamilyName3)); - } // namespace rocksdb int main(int argc, char** argv) { From 0153e14569c30f225d7a08050acbf10c4d211d41 Mon Sep 17 00:00:00 2001 From: anand76 Date: Wed, 5 Jun 2019 09:38:23 -0700 Subject: [PATCH 108/572] Add a MultiRead() method to Env (#5311) Summary: Define the Env:: MultiRead() method to allow callers to request multiple block reads in one shot. The underlying Env implementation can parallelize it if it chooses to in order to reduce the overall IO latency. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5311 Differential Revision: D15502172 Pulled By: anand1976 fbshipit-source-id: 2b228269c2e11b5f54694d6b2bb3119c8a8ce2b9 --- env/env_test.cc | 53 +++++++++++++++++++++++++++++++++++++++++++ include/rocksdb/env.h | 39 +++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/env/env_test.cc b/env/env_test.cc index 30d5b528217..a2b6db5c475 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -1105,6 +1105,59 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { } } +TEST_P(EnvPosixTestWithParam, MultiRead) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 4096; + const size_t kNumSectors = 8; + + // Create file. + { + std::unique_ptr wfile; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } +#endif + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice)); + } + ASSERT_OK(wfile->Close()); + } + + // Random Read + { + std::unique_ptr file; + std::vector reqs(3); + std::vector> data; + uint64_t offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offset; + offset += 2 * kSectorSize; + reqs[i].len = kSectorSize; + data.emplace_back(NewAligned(kSectorSize, 0)); + reqs[i].scratch = data.back().get(); + } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + for (size_t i = 0; i < reqs.size(); ++i) { + auto buf = NewAligned(kSectorSize * 8, static_cast(i*2 + 1)); + ASSERT_OK(reqs[i].status); + ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0); + } + } +} + // Only works in linux platforms #ifdef OS_WIN TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index a8fe2fb78ea..0a055cea0bf 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -583,6 +583,26 @@ class SequentialFile { // SequentialFileWrapper too. 
};

+// A read IO request structure for use in MultiRead.
+struct ReadRequest {
+  // File offset in bytes
+  uint64_t offset;
+
+  // Length to read in bytes
+  size_t len;
+
+  // A buffer that MultiRead() can optionally place data in. It can
+  // ignore this and allocate its own buffer.
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes.
+  Slice result;
+
+  // Status of the read
+  Status status;
+};
+
 // A file abstraction for randomly reading the contents of a file.
 class RandomAccessFile {
  public:
@@ -607,6 +627,22 @@ class RandomAccessFile {
     return Status::OK();
   }

+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e. it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping. If the function's return status is not ok, the status
+  // of individual requests will be ignored and the return status will be
+  // assumed for all read requests. The function's return status is only
+  // meant for errors that occur before processing the individual read
+  // requests.
+  virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest& req = reqs[i];
+      req.status = Read(req.offset, req.len, &req.result, req.scratch);
+    }
+    return Status::OK();
+  }
+
   // Tries to get a unique ID for this file that will be the same each time
   // the file is opened (and will stay the same while the file is open).
   // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
@@ -1357,6 +1393,9 @@ class RandomAccessFileWrapper : public RandomAccessFile {
               char* scratch) const override {
     return target_->Read(offset, n, result, scratch);
   }
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+    return target_->MultiRead(reqs, num_reqs);
+  }
   Status Prefetch(uint64_t offset, size_t n) override {
     return target_->Prefetch(offset, n);
   }

From 267b9b109176f51e59604233d6bef5293278f2a1 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 5 Jun 2019 13:56:46 -0700
Subject: [PATCH 109/572] Disable dynamic extension support by default for
 CMake (#5419)

Summary:
We have users reporting linking errors while building RocksDB using CMake,
and they do not need the dynamic extension feature. The fix is to add
`-DROCKSDB_NO_DYNAMIC_EXTENSION` to CMake by default.
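
For context, a minimal sketch (not part of this patch) of the kind of code the
macro compiles out; the guard mirrors the `#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION`
block in env_posix.cc, and the plugin name "my_plugin" is hypothetical:
```
#include <memory>

#include "rocksdb/env.h"

// Sketch only: when ROCKSDB_NO_DYNAMIC_EXTENSION is defined (now the CMake
// default), Env::LoadLibrary is compiled out, so callers must guard their
// use of it the same way.
rocksdb::Status TryLoadPlugin(rocksdb::Env* env) {
#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
  std::shared_ptr<rocksdb::DynamicLibrary> lib;
  // An empty search path falls back to the default (LD_LIBRARY_PATH).
  return env->LoadLibrary("my_plugin" /* hypothetical */, "", &lib);
#else
  (void)env;
  return rocksdb::Status::NotSupported("built without dynamic extension");
#endif
}
```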
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5419

Differential Revision: D15676792

Pulled By: riversand963

fbshipit-source-id: d45aaacfc64ea61646fd7329c352cd760145baf3
---
 CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cb4cc7a863..354697b05bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -332,6 +332,10 @@ if(DISABLE_STALL_NOTIF)
   add_definitions(-DROCKSDB_DISABLE_STALL_NOTIFICATION)
 endif()

+option(WITH_DYNAMIC_EXTENSION "build with dynamic extension support" OFF)
+if(NOT WITH_DYNAMIC_EXTENSION)
+  add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION)
+endif()

 if(DEFINED USE_RTTI)
   if(USE_RTTI)
@@ -488,7 +492,7 @@ set(SOURCES
   db/compacted_db_impl.cc
   db/compaction/compaction.cc
   db/compaction/compaction_iterator.cc
-  db/compaction/compaction_picker.cc
+  db/compaction/compaction_picker.cc
   db/compaction/compaction_job.cc
   db/compaction/compaction_picker_fifo.cc
   db/compaction/compaction_picker_level.cc

From cb1bf09bfc912472b380d09ee2b733a6684457d7 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 5 Jun 2019 15:16:43 -0700
Subject: [PATCH 110/572] Fix tsan error (#5414)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
The previous code produced a warning when compiled with tsan, leading to an
error since we build with -Werror. Compilation result:
```
In file included from ./env/env_chroot.h:12,
                 from env/env_test.cc:40:
./include/rocksdb/env.h: In instantiation of ‘rocksdb::Status rocksdb::DynamicLibrary::LoadFunction(const string&, std::function<T>*) [with T = void*(void*, const char*); std::__cxx11::string = std::__cxx11::basic_string<char>]’:
env/env_test.cc:260:5:   required from here
./include/rocksdb/env.h:1010:17: error: cast between incompatible function types from ‘rocksdb::DynamicLibrary::FunctionPtr’ {aka ‘void* (*)()’} to ‘void* (*)(void*, const char*)’ [-Werror=cast-function-type]
    *function = reinterpret_cast<T*>(ptr);
                ^~~~~~~~~~~~~~~~~~~~~~~~~
cc1plus: all warnings being treated as errors
make: *** [env/env_test.o] Error 1
```
The clang analyzer also reported a warning:
```
env/env_posix.cc:141:11: warning: Value stored to 'err' during its initialization is never read
    char* err = dlerror();  // Clear any old error
          ^~~   ~~~~~~~~~
1 warning generated.
```

Test plan (on my devserver).
```
$make clean
$OPT=-g ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1 make -j32
$
$make clean
$USE_CLANG=1 TEST_TMPDIR=/dev/shm/rocksdb OPT=-g make -j1 analyze
```
Both should pass.
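
As an illustration, a hedged usage sketch (not part of the patch) of the
revised `LoadFunction` template after this fix; the library handle and the
symbol name "my_symbol" are assumptions:
```
#include <functional>

#include "rocksdb/env.h"

// Sketch: instantiate LoadFunction with T = void*(void*, const char*), the
// same instantiation that triggered the tsan build error above. LoadSymbol
// now hands back a plain void*, and LoadFunction reinterpret_casts it to T*,
// so no function-pointer-to-function-pointer cast is emitted.
rocksdb::Status LoadLookupFn(
    rocksdb::DynamicLibrary* lib,
    std::function<void*(void*, const char*)>* fn) {
  return lib->LoadFunction("my_symbol" /* hypothetical */, fn);
}
```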
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5414

Differential Revision: D15637315

Pulled By: riversand963

fbshipit-source-id: 8e307483761019a4d5998cab92d49516d7edffbf
---
 env/env_posix.cc      | 27 +++++++++++++--------------
 include/rocksdb/env.h | 16 +++++++---------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/env/env_posix.cc b/env/env_posix.cc
index f1a0907c9fe..c0edb00968e 100644
--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@@ -137,13 +137,14 @@ class PosixDynamicLibrary : public DynamicLibrary {
       : name_(name), handle_(handle) {}
   ~PosixDynamicLibrary() override { dlclose(handle_); }

-  Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) override {
-    char* err = dlerror();  // Clear any old error
-    *func = (FunctionPtr)dlsym(handle_, sym_name.c_str());
+  Status LoadSymbol(const std::string& sym_name, void** func) override {
+    assert(nullptr != func);
+    dlerror();  // Clear any old error
+    *func = dlsym(handle_, sym_name.c_str());
     if (*func != nullptr) {
       return Status::OK();
     } else {
-      err = dlerror();
+      char* err = dlerror();
       return Status::NotFound("Error finding symbol: " + sym_name, err);
     }
   }
@@ -771,16 +772,14 @@ class PosixEnv : public Env {
   }

 #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
-  /**
-   * Loads the named library into the result.
-   * If the input name is empty, the current executable is loaded
-   * On *nix systems, a "lib" prefix is added to the name if one is not supplied
-   * Comparably, the appropriate shared library extension is added to the name
-   * if not supplied. If search_path is not specified, the shared library will
-   * be loaded using the default path (LD_LIBRARY_PATH) If search_path is
-   * specified, the shared library will be searched for in the directories
-   * provided by the search path
-   */
+  // Loads the named library into the result.
+  // If the input name is empty, the current executable is loaded.
+  // On *nix systems, a "lib" prefix is added to the name if one is not
+  // supplied. Similarly, the appropriate shared library extension is added
+  // to the name if not supplied. If search_path is not specified, the shared
+  // library will be loaded using the default path (LD_LIBRARY_PATH). If
+  // search_path is specified, the shared library will be searched for in the
+  // directories provided by the search path.
   Status LoadLibrary(const std::string& name, const std::string& path,
                      std::shared_ptr<DynamicLibrary>* result) override {
     Status status;
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 0a055cea0bf..ba8978dc810 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -1029,25 +1029,23 @@ class FileLock {

 class DynamicLibrary {
  public:
-  typedef void* (*FunctionPtr)();
   virtual ~DynamicLibrary() {}

-  /** Returns the name of the dynamic library */
+  // Returns the name of the dynamic library.
   virtual const char* Name() const = 0;

-  /**
-   * Loads the symbol for sym_name from the library and updates the input
-   * function. Returns the loaded symbol
-   */
+  // Loads the symbol for sym_name from the library and updates the input
+  // function. Returns the loaded symbol.
   template <typename T>
   Status LoadFunction(const std::string& sym_name,
                       std::function<T>* function) {
-    FunctionPtr ptr;
+    assert(nullptr != function);
+    void* ptr = nullptr;
     Status s = LoadSymbol(sym_name, &ptr);
     *function = reinterpret_cast<T*>(ptr);
     return s;
   }
-  /** Loads and returns the symbol for sym_name from the library */
-  virtual Status LoadSymbol(const std::string& sym_name, FunctionPtr* func) = 0;
+  // Loads and returns the symbol for sym_name from the library.
+  virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
 };

 extern void LogFlush(const std::shared_ptr<Logger>& info_log);

From 340ed4fac751025dcf4368affabf950b3a417a05 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 5 Jun 2019 23:07:28 -0700
Subject: [PATCH 111/572] Add support for timestamp in Get/Put (#5079)

Summary:
It's useful to be able to (optionally) associate key-value pairs with
user-provided timestamps. This PR is an early effort towards this goal and
continues the work of facebook#4942. A suite of new unit tests exists in
DBBasicTestWithTimestampWithParam.

Support for timestamps requires the user to provide a timestamp as a slice in
`ReadOptions` and `WriteOptions`. All timestamps in the same database must
share the same length and format, and the user is responsible for providing a
comparator function (Comparator) to order the tuples. Once created, the format
and length of the timestamp cannot change (at least for now).

Test plan (on devserver):
```
$COMPILE_WITH_ASAN=1 make -j32 all
$./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/*
$make check
```
All tests must pass.

We also run the following db_bench tests to verify whether there is a
regression on Get/Put while timestamps are not enabled.
```
$TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000
$TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000
```
Repeat 6 times for both versions. Results are as follows:
```
|        | readrandom | fillrandom |
| master | 16.77 MB/s | 47.05 MB/s |
| PR5079 | 16.44 MB/s | 47.03 MB/s |
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5079

Differential Revision: D15132946

Pulled By: riversand963

fbshipit-source-id: 833a0d657eac21182f0f206c910a6438154c742c
---
 HISTORY.md                                     |   1 +
 db/db_basic_test.cc                            | 151 ++++++++++++++++++
 db/db_impl/db_impl.cc                          |  11 +-
 db/db_impl/db_impl_write.cc                    |  24 ++-
 db/dbformat.h                                  |  27 ++++
 db/memtable.cc                                 |  16 +-
 db/version_set.cc                              |  43 ++---
 include/rocksdb/comparator.h                   |  27 ++++
 include/rocksdb/options.h                      |  22 ++-
 options/options.cc                             |   6 +-
 .../block_based/block_based_table_builder.cc   |   3 +-
 table/block_based/block_based_table_reader.cc  |  17 +-
 table/get_context.cc                           |   2 +-
 util/comparator.cc                             |   8 +
 14 files changed, 318 insertions(+), 40 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b3c2ef14ac2..028ddcf8253 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -5,6 +5,7 @@
 * Partitions of partitioned indexes no longer affect the read amplification statistics.
 * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
 * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set.
+* Add initial support for Get/Put with user timestamps.
Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 45524b250f7..1aec864dd6f 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -1284,6 +1284,157 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } } + +class DBBasicTestWithTimestampWithParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestWithTimestampWithParam() + : DBTestBase("/db_basic_test_with_timestamp") {} + + protected: + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + assert(a.size() >= timestamp_size()); + assert(b.size() >= timestamp_size()); + Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); + Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + + return cmp_without_ts_->Compare(k1, k2); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + auto* ptr1 = const_cast(&ts1); + auto* ptr2 = const_cast(&ts2); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return 1; + } else if (high1 > high2) { + return -1; + } + if (low1 < low2) { + return 1; + } else if (low1 > low2) { + return -1; + } + return 0; + } + }; + + Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { + assert(nullptr != ts); + ts->clear(); + PutFixed64(ts, low); + PutFixed64(ts, high); + assert(ts->size() == sizeof(low) + sizeof(high)); + return Slice(*ts); + } +}; + +TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 6; + bool memtable_only = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); 
+ CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + if (!memtable_only) { + ASSERT_OK(Flush(cf)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); +} + +INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, + ::testing::Bool()); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index ba76abc2875..96b911a6d37 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1376,7 +1376,16 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + if (nullptr == read_options.timestamp) { + return GetImpl(read_options, column_family, key, value); + } + Slice akey; + std::string buf; + Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf); + if (s.ok()) { + s = GetImpl(read_options, column_family, akey, value); + } + return s; } Status DBImpl::GetImpl(const ReadOptions& read_options, diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 02e23e26931..947194ace19 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1677,11 +1677,25 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { // can call if they wish Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - // Pre-allocate size of write batch conservatively. - // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, - // and we allocate 11 extra bytes for key length, as well as value length. - WriteBatch batch(key.size() + value.size() + 24); - Status s = batch.Put(column_family, key, value); + if (nullptr == opt.timestamp) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
+ WriteBatch batch(key.size() + value.size() + 24); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + Slice akey; + std::string buf; + Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf); + if (!s.ok()) { + return s; + } + WriteBatch batch(akey.size() + value.size() + 24); + s = batch.Put(column_family, akey, value); if (!s.ok()) { return s; } diff --git a/db/dbformat.h b/db/dbformat.h index dbf6ea6f3c9..c6ee5677c09 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -151,6 +151,17 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - 8); } +inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, + size_t ts_sz) { + assert(internal_key.size() >= 8 + ts_sz); + return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz); +} + +inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + assert(user_key.size() >= ts_sz); + return Slice(user_key.data(), user_key.size() - ts_sz); +} + inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { assert(internal_key.size() >= 8); const size_t n = internal_key.size(); @@ -658,4 +669,20 @@ struct ParsedInternalKeyComparator { const InternalKeyComparator* cmp; }; +// TODO (yanqin): this causes extra memory allocation and copy. Should be +// addressed in the future. +inline Status AppendTimestamp(const Slice& key, const Slice& timestamp, + Slice* ret_key, std::string* ret_buf) { + assert(ret_key != nullptr); + assert(ret_buf != nullptr); + if (key.data() + key.size() == timestamp.data()) { + *ret_key = Slice(key.data(), key.size() + timestamp.size()); + } else { + ret_buf->assign(key.data(), key.size()); + ret_buf->append(timestamp.data(), timestamp.size()); + *ret_key = Slice(*ret_buf); + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc index 46acbbfa61a..fdd1a577ade 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -493,6 +493,8 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (!allow_concurrent) { // Extract prefix for insert with hint. if (insert_with_hint_prefix_extractor_ != nullptr && @@ -525,7 +527,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->Add(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key); + bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); } // The first sequence number inserted into the memtable @@ -559,7 +561,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key); + bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); } // atomically update first_seqno_ and earliest_seqno_. @@ -632,8 +634,10 @@ static bool SaveValue(void* arg, const char* entry) { // all entries with overly large sequence numbers. 
uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); - if (s->mem->GetInternalKeyComparator().user_comparator()->Equal( - Slice(key_ptr, key_length - 8), s->key->user_key())) { + Slice user_key_slice = Slice(key_ptr, key_length - 8); + if (s->mem->GetInternalKeyComparator() + .user_comparator() + ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -767,11 +771,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key); + may_contain = + bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); } else { assert(prefix_extractor_); may_contain = diff --git a/db/version_set.cc b/db/version_set.cc index a60a4e87cac..ed9a316ac72 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -93,7 +93,8 @@ Status OverlapWithIterator(const Comparator* ucmp, return Status::Corruption("DB have corrupted keys"); } - if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { *overlap = true; } } @@ -171,17 +172,16 @@ class FilePicker { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
- assert( - curr_level_ == 0 || - curr_index_in_curr_level_ == start_index_in_curr_level_ || - user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)) <= 0); - - int cmp_smallest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)); + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); } // Setup file search bound for the next level based on the @@ -799,14 +799,16 @@ static bool AfterFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs before all keys and is therefore never after *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->largest_key)) > 0); } static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs after all keys and is therefore never before *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->smallest_key)) < 0); } bool SomeFileOverlapsRange( @@ -952,8 +954,9 @@ class LevelIterator final : public InternalIterator { bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) >= 0; } InternalIterator* NewFileIterator() { @@ -2774,11 +2777,12 @@ void VersionStorageInfo::GetOverlappingInputs( FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + if (begin != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { // "f" is completely before specified range; skip it iter++; } else if (end != nullptr && - user_cmp->Compare(file_start, user_end) > 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { // "f" is completely after specified range; skip it iter++; } else { @@ -2793,10 +2797,11 @@ void VersionStorageInfo::GetOverlappingInputs( iter = index.erase(iter); if (expand_range) { if (begin != nullptr && - user_cmp->Compare(file_start, user_begin) < 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) { user_begin = file_start; } - if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) { + if (end != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) { user_end = file_limit; } } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 46279f9a693..9f262367d11 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -20,6 +20,19 @@ class Slice; // from multiple threads. 
class Comparator { public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + virtual ~Comparator() {} // Three-way comparison. Returns value: @@ -78,6 +91,20 @@ class Comparator { // The major use case is to determine if DataBlockHashIndex is compatible // with the customized comparator. virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return Compare(a, b); + } + + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + private: + size_t timestamp_size_; }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index cc7119410a0..307582fe678 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1255,6 +1255,14 @@ struct ReadOptions { // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; @@ -1307,12 +1315,24 @@ struct WriteOptions { // Default: false bool low_pri; + // Timestamp of write operation, e.g. Put. All timestamps of the same + // database must share the same length and format. The user is also + // responsible for providing a customized compare function via Comparator to + // order tuples. If the user wants to enable timestamp, then + // all write operations must be associated with timestamp because RocksDB, as + // a single-node storage engine currently has no knowledge of global time, + // thus has to rely on the application. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. 
+  const Slice* timestamp;
+
   WriteOptions()
       : sync(false),
         disableWAL(false),
         ignore_missing_column_families(false),
         no_slowdown(false),
-        low_pri(false) {}
+        low_pri(false),
+        timestamp(nullptr) {}
 };

 // Options that control flush operations
diff --git a/options/options.cc b/options/options.cc
index a5037ee78d3..8977b58905f 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -600,7 +600,8 @@ ReadOptions::ReadOptions()
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
       ignore_range_deletions(false),
-      iter_start_seqnum(0) {}
+      iter_start_seqnum(0),
+      timestamp(nullptr) {}

 ReadOptions::ReadOptions(bool cksum, bool cache)
     : snapshot(nullptr),
@@ -618,6 +619,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
       ignore_range_deletions(false),
-      iter_start_seqnum(0) {}
+      iter_start_seqnum(0),
+      timestamp(nullptr) {}

 }  // namespace rocksdb
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 9769e394f87..cae93f7f26f 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -531,7 +531,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
   // Note: PartitionedFilterBlockBuilder requires key being added to filter
   // builder after being added to index builder.
   if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) {
-    r->filter_builder->Add(ExtractUserKey(key));
+    size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size();
+    r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
   }

   r->last_key.assign(key.data(), key.size());
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 2fdaf2afd2a..37bbc3b52b3 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2672,8 +2672,11 @@ bool BlockBasedTable::FullFilterKeyMayMatch(
   const Slice* const const_ikey_ptr = &internal_key;
   bool may_match = true;
   if (filter->whole_key_filtering()) {
-    may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid,
-                                    no_io, const_ikey_ptr);
+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
+    Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
+    may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor,
+                                    kNotValid, no_io, const_ikey_ptr);
   } else if (!read_options.total_order_seek && prefix_extractor &&
              rep_->table_properties->prefix_extractor_name.compare(
                  prefix_extractor->Name()) == 0 &&
@@ -2755,6 +2758,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       iiter_unique_ptr.reset(iiter);
     }

+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
    bool matched = false;  // if such user key matched a key in SST
    bool done = false;
    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
@@ -2762,8 +2767,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       bool not_exist_in_filter =
           filter != nullptr && filter->IsBlockBased() == true &&
-          !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
-                               handle.offset(), no_io);
+          !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
+                               prefix_extractor, handle.offset(), no_io);

       if (not_exist_in_filter) {
         // Not found
@@ -2793,7 +2798,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
         }
         bool may_exist = biter.SeekForGet(key);
-        if (!may_exist) {
+        // If user-specified timestamp is supported, we cannot end the search
+        // just because hash index lookup indicates the key+ts does not exist.
+        if (!may_exist && ts_sz == 0) {
           // HashSeek cannot find the key in this block and the iter is not
           // the end of the block, i.e. cannot be in the following blocks
           // either. In this case, the seek_key cannot be found, so we break
diff --git a/table/get_context.cc b/table/get_context.cc
index 24c9ba7d5b7..9be16b0627d 100644
--- a/table/get_context.cc
+++ b/table/get_context.cc
@@ -182,7 +182,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
   assert(matched);
   assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
          merge_context_ != nullptr);
-  if (ucmp_->Equal(parsed_key.user_key, user_key_)) {
+  if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
     *matched = true;
     // If the value is not in the snapshot, skip it
     if (!CheckCallback(parsed_key.sequence)) {
diff --git a/util/comparator.cc b/util/comparator.cc
index eab17ebccf3..717ebb52353 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -124,6 +124,10 @@ class BytewiseComparatorImpl : public Comparator {
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return a.compare(b);
+  }
 };

 class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
@@ -192,6 +196,10 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return -a.compare(b);
+  }
 };

 }  // namespace

From aa71718ac3f5c2ed41f44f2dd5aa51aac6c1583e Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 6 Jun 2019 11:21:11 -0700
Subject: [PATCH 112/572] Add block cache tracer. (#5410)

Summary:
This PR adds a helper class, block cache tracer, to read/write block cache
accesses. It uses the trace reader/writer to perform this task.
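
For a sense of the API, a hedged sketch (not part of this patch) that writes
one access record with the new classes; the trace file path and the field
values are illustrative only:
```
#include <memory>
#include <utility>

#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

// Sketch: record a single data-block cache miss triggered by a user Get.
rocksdb::Status TraceOneAccess(rocksdb::Env* env) {
  std::unique_ptr<rocksdb::TraceWriter> file_writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(
      env, rocksdb::EnvOptions(), "/tmp/block_cache_trace" /* hypothetical */,
      &file_writer);
  if (!s.ok()) {
    return s;
  }
  rocksdb::BlockCacheTraceWriter writer(env, rocksdb::TraceOptions(),
                                        std::move(file_writer));
  s = writer.WriteHeader();  // writes the magic number + RocksDB version
  if (!s.ok()) {
    return s;
  }
  rocksdb::BlockCacheTraceRecord record;
  record.access_timestamp = env->NowMicros();
  record.block_key = "test-block-1";
  record.block_type = rocksdb::TraceType::kBlockTraceDataBlock;
  record.block_size = 1024;
  record.cf_id = 0;
  record.cf_name = "default";
  record.level = 1;
  record.sst_fd_number = 100;
  record.caller = rocksdb::BlockCacheLookupCaller::kUserGet;
  record.is_cache_hit = rocksdb::Boolean::kFalse;
  record.no_insert = rocksdb::Boolean::kFalse;
  // The referenced-key fields are only persisted for data blocks read on
  // behalf of Get/MultiGet (see ShouldTraceReferencedKey).
  record.referenced_key = "test-get-1";
  record.is_referenced_key_exist_in_block = rocksdb::Boolean::kTrue;
  record.num_keys_in_block = 1024;
  return writer.WriteBlockAccess(record);
}
```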
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5410 Differential Revision: D15612843 Pulled By: HaoyuHuang fbshipit-source-id: f30fd1e1524355ca87db5d533a5c086728b141ea --- CMakeLists.txt | 1 + Makefile | 4 + src.mk | 2 + trace_replay/block_cache_tracer.cc | 218 ++++++++++++++++++++++++ trace_replay/block_cache_tracer.h | 105 ++++++++++++ trace_replay/block_cache_tracer_test.cc | 167 ++++++++++++++++++ trace_replay/trace_replay.cc | 38 +++-- trace_replay/trace_replay.h | 15 ++ 8 files changed, 538 insertions(+), 12 deletions(-) create mode 100644 trace_replay/block_cache_tracer.cc create mode 100644 trace_replay/block_cache_tracer.h create mode 100644 trace_replay/block_cache_tracer_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 354697b05bb..cef1f85d797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -628,6 +628,7 @@ set(SOURCES tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc trace_replay/trace_replay.cc + trace_replay/block_cache_tracer.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc diff --git a/Makefile b/Makefile index 080e0713355..3ee85ad67d0 100644 --- a/Makefile +++ b/Makefile @@ -561,6 +561,7 @@ TESTS = \ range_del_aggregator_test \ sst_file_reader_test \ db_secondary_test \ + block_cache_tracer_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -1588,6 +1589,9 @@ sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local diff --git a/src.mk b/src.mk index c172d0b2c2d..6303997cd59 100644 --- a/src.mk +++ b/src.mk @@ -143,6 +143,7 @@ LIB_SOURCES = \ test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ trace_replay/trace_replay.cc \ + trace_replay/block_cache_tracer.cc \ util/bloom.cc \ util/build_version.cc \ util/coding.cc \ @@ -371,6 +372,7 @@ MAIN_SOURCES = \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ tools/trace_analyzer_test.cc \ + trace_replay/block_cache_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc new file mode 100644 index 00000000000..8d0119a6891 --- /dev/null +++ b/trace_replay/block_cache_tracer.cc @@ -0,0 +1,218 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "trace_replay/block_cache_tracer.h" + +#include "db/db_impl/db_impl.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace rocksdb { + +namespace { +const unsigned int kCharSize = 1; +bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) { + return (record.block_type == TraceType::kBlockTraceDataBlock) && + (record.caller == BlockCacheLookupCaller::kUserGet || + record.caller == BlockCacheLookupCaller::kUserMGet); +} +} // namespace + +BlockCacheTraceWriter::BlockCacheTraceWriter( + Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)) {} + +bool BlockCacheTraceWriter::ShouldTrace( + const BlockCacheTraceRecord& record) const { + if (trace_options_.sampling_frequency == 0 || + trace_options_.sampling_frequency == 1) { + return true; + } + // We use spatial downsampling so that we have a complete access history for a + // block. + const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); + return hash % trace_options_.sampling_frequency == 0; +} + +Status BlockCacheTraceWriter::WriteBlockAccess( + const BlockCacheTraceRecord& record) { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + if (trace_file_size > trace_options_.max_trace_file_size || + !ShouldTrace(record)) { + return Status::OK(); + } + Trace trace; + trace.ts = record.access_timestamp; + trace.type = record.block_type; + PutLengthPrefixedSlice(&trace.payload, record.block_key); + PutFixed64(&trace.payload, record.block_size); + PutFixed32(&trace.payload, record.cf_id); + PutLengthPrefixedSlice(&trace.payload, record.cf_name); + PutFixed32(&trace.payload, record.level); + PutFixed32(&trace.payload, record.sst_fd_number); + trace.payload.push_back(record.caller); + trace.payload.push_back(record.is_cache_hit); + trace.payload.push_back(record.no_insert); + if (ShouldTraceReferencedKey(record)) { + PutLengthPrefixedSlice(&trace.payload, record.referenced_key); + PutFixed64(&trace.payload, record.num_keys_in_block); + trace.payload.push_back(record.is_referenced_key_exist_in_block); + } + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + return trace_writer_->Write(encoded_trace); +} + +Status BlockCacheTraceWriter::WriteHeader() { + Trace trace; + trace.ts = env_->NowMicros(); + trace.type = TraceType::kTraceBegin; + PutLengthPrefixedSlice(&trace.payload, kTraceMagic); + PutFixed32(&trace.payload, kMajorVersion); + PutFixed32(&trace.payload, kMinorVersion); + std::string encoded_trace; + TracerHelper::EncodeTrace(trace, &encoded_trace); + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + return trace_writer_->Write(encoded_trace); +} + +BlockCacheTraceReader::BlockCacheTraceReader( + std::unique_ptr&& reader) + : trace_reader_(std::move(reader)) {} + +Status BlockCacheTraceReader::ReadHeader(BlockCacheTraceHeader* header) { + assert(header != nullptr); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + header->start_time = trace.ts; + Slice enc_slice = Slice(trace.payload); + Slice magnic_number; + if (!GetLengthPrefixedSlice(&enc_slice, &magnic_number)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read the magic 
number."); + } + if (magnic_number.ToString() != kTraceMagic) { + return Status::Corruption( + "Corrupted header in the trace file: Magic number does not match."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_major_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb major " + "version number."); + } + if (!GetFixed32(&enc_slice, &header->rocksdb_minor_version)) { + return Status::Corruption( + "Corrupted header in the trace file: Failed to read rocksdb minor " + "version number."); + } + // We should have retrieved all information in the header. + if (!enc_slice.empty()) { + return Status::Corruption( + "Corrupted header in the trace file: The length of header is too " + "long."); + } + return Status::OK(); +} + +Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { + assert(record); + std::string encoded_trace; + Status s = trace_reader_->Read(&encoded_trace); + if (!s.ok()) { + return s; + } + Trace trace; + s = TracerHelper::DecodeTrace(encoded_trace, &trace); + if (!s.ok()) { + return s; + } + record->access_timestamp = trace.ts; + record->block_type = trace.type; + Slice enc_slice = Slice(trace.payload); + Slice block_key; + if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) { + return Status::Incomplete( + "Incomplete access record: Failed to read block key."); + } + record->block_key = block_key.ToString(); + if (!GetFixed64(&enc_slice, &record->block_size)) { + return Status::Incomplete( + "Incomplete access record: Failed to read block size."); + } + if (!GetFixed32(&enc_slice, &record->cf_id)) { + return Status::Incomplete( + "Incomplete access record: Failed to read column family ID."); + } + Slice cf_name; + if (!GetLengthPrefixedSlice(&enc_slice, &cf_name)) { + return Status::Incomplete( + "Incomplete access record: Failed to read column family name."); + } + record->cf_name = cf_name.ToString(); + if (!GetFixed32(&enc_slice, &record->level)) { + return Status::Incomplete( + "Incomplete access record: Failed to read level."); + } + if (!GetFixed32(&enc_slice, &record->sst_fd_number)) { + return Status::Incomplete( + "Incomplete access record: Failed to read SST file number."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read caller."); + } + record->caller = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read is_cache_hit."); + } + record->is_cache_hit = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read no_insert."); + } + record->no_insert = static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); + + if (ShouldTraceReferencedKey(*record)) { + Slice referenced_key; + if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the referenced key."); + } + record->referenced_key = referenced_key.ToString(); + if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the number of keys in the " + "block."); + } + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read " + "is_referenced_key_exist_in_block."); + } + record->is_referenced_key_exist_in_block = + static_cast(enc_slice[0]); + } + return Status::OK(); +} + +} // 
namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h new file mode 100644 index 00000000000..7b3c82e2b7e --- /dev/null +++ b/trace_replay/block_cache_tracer.h @@ -0,0 +1,105 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/trace_reader_writer.h" +#include "trace_replay/trace_replay.h" + +namespace rocksdb { + +enum BlockCacheLookupCaller : char { + kUserGet = 1, + kUserMGet = 2, + kUserIterator = 3, + kPrefetch = 4, + kCompaction = 5, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; + +enum Boolean : char { kTrue = 1, kFalse = 0 }; + +struct BlockCacheTraceRecord { + // Required fields for all accesses. + uint64_t access_timestamp; + std::string block_key; + TraceType block_type; + uint64_t block_size; + uint32_t cf_id; + std::string cf_name; + uint32_t level; + uint32_t sst_fd_number; + BlockCacheLookupCaller caller; + Boolean is_cache_hit; + Boolean no_insert; + + // Required fields for data block and user Get/Multi-Get only. + std::string referenced_key; + uint64_t num_keys_in_block = 0; + Boolean is_referenced_key_exist_in_block = Boolean::kFalse; +}; + +struct BlockCacheTraceHeader { + uint64_t start_time; + uint32_t rocksdb_major_version; + uint32_t rocksdb_minor_version; +}; + +// BlockCacheTraceWriter captures all RocksDB block cache accesses using a +// user-provided TraceWriter. Every RocksDB operation is written as a single +// trace. Each trace will have a timestamp and type, followed by the trace +// payload. +class BlockCacheTraceWriter { + public: + BlockCacheTraceWriter(Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); + ~BlockCacheTraceWriter() = default; + // No copy and move. + BlockCacheTraceWriter(const BlockCacheTraceWriter&) = delete; + BlockCacheTraceWriter& operator=(const BlockCacheTraceWriter&) = delete; + BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete; + BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete; + + Status WriteBlockAccess(const BlockCacheTraceRecord& record); + + // Write a trace header at the beginning, typically on initiating a trace, + // with some metadata like a magic number and RocksDB version. + Status WriteHeader(); + + private: + bool ShouldTrace(const BlockCacheTraceRecord& record) const; + + Env* env_; + TraceOptions trace_options_; + std::unique_ptr trace_writer_; + /*Mutex to protect trace_writer_ */ + InstrumentedMutex trace_writer_mutex_; +}; + +// BlockCacheTraceReader helps read the trace file generated by +// BlockCacheTraceWriter using a user provided TraceReader. +class BlockCacheTraceReader { + public: + BlockCacheTraceReader(std::unique_ptr&& reader); + ~BlockCacheTraceReader() = default; + // No copy and move. 
+ BlockCacheTraceReader(const BlockCacheTraceReader&) = delete; + BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete; + BlockCacheTraceReader(BlockCacheTraceReader&&) = delete; + BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete; + + Status ReadHeader(BlockCacheTraceHeader* header); + + Status ReadAccess(BlockCacheTraceRecord* record); + + private: + std::unique_ptr trace_reader_; +}; + +} // namespace rocksdb diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc new file mode 100644 index 00000000000..28052d9db8d --- /dev/null +++ b/trace_replay/block_cache_tracer_test.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "trace_replay/block_cache_tracer.h" +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +namespace { +const uint64_t kBlockSize = 1024; +const std::string kBlockKeyPrefix = "test-block-"; +const uint32_t kCFId = 0; +const uint32_t kLevel = 1; +const uint64_t kSSTFDNumber = 100; +const std::string kRefKeyPrefix = "test-get-"; +const uint64_t kNumKeysInBlock = 1024; +} // namespace + +class BlockCacheTracerTest : public testing::Test { + public: + BlockCacheTracerTest() { + test_path_ = test::PerThreadDBPath("block_cache_tracer_test"); + env_ = rocksdb::Env::Default(); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace"; + } + + ~BlockCacheTracerTest() override { + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + + BlockCacheLookupCaller GetCaller(uint32_t key_id) { + uint32_t n = key_id % 5; + switch (n) { + case 0: + return BlockCacheLookupCaller::kPrefetch; + case 1: + return BlockCacheLookupCaller::kCompaction; + case 2: + return BlockCacheLookupCaller::kUserGet; + case 3: + return BlockCacheLookupCaller::kUserMGet; + case 4: + return BlockCacheLookupCaller::kUserIterator; + } + assert(false); + } + + void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id, + TraceType block_type, uint32_t nblocks) { + assert(writer); + for (uint32_t i = 0; i < nblocks; i++) { + uint32_t key_id = from_key_id + i; + BlockCacheTraceRecord record; + record.block_type = block_type; + record.block_size = kBlockSize + key_id; + record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.access_timestamp = env_->NowMicros(); + record.cf_id = kCFId; + record.cf_name = kDefaultColumnFamilyName; + record.caller = GetCaller(key_id); + record.level = kLevel; + record.sst_fd_number = kSSTFDNumber + key_id; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + // Provide these fields for all block types. + // The writer should only write these fields for data blocks and the + // caller is either GET or MGET. 
+      record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
+      record.is_referenced_key_exist_in_block = Boolean::kTrue;
+      record.num_keys_in_block = kNumKeysInBlock;
+      ASSERT_OK(writer->WriteBlockAccess(record));
+    }
+  }
+
+  void VerifyAccess(BlockCacheTraceReader* reader, uint32_t from_key_id,
+                    TraceType block_type, uint32_t nblocks) {
+    assert(reader);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      BlockCacheTraceRecord record;
+      ASSERT_OK(reader->ReadAccess(&record));
+      ASSERT_EQ(block_type, record.block_type);
+      ASSERT_EQ(kBlockSize + key_id, record.block_size);
+      ASSERT_EQ(kBlockKeyPrefix + std::to_string(key_id), record.block_key);
+      ASSERT_EQ(kCFId, record.cf_id);
+      ASSERT_EQ(kDefaultColumnFamilyName, record.cf_name);
+      ASSERT_EQ(GetCaller(key_id), record.caller);
+      ASSERT_EQ(kLevel, record.level);
+      ASSERT_EQ(kSSTFDNumber + key_id, record.sst_fd_number);
+      ASSERT_EQ(Boolean::kFalse, record.is_cache_hit);
+      ASSERT_EQ(Boolean::kFalse, record.no_insert);
+      if (block_type == TraceType::kBlockTraceDataBlock &&
+          (record.caller == BlockCacheLookupCaller::kUserGet ||
+           record.caller == BlockCacheLookupCaller::kUserMGet)) {
+        ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id),
+                  record.referenced_key);
+        ASSERT_EQ(Boolean::kTrue, record.is_referenced_key_exist_in_block);
+        ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block);
+        continue;
+      }
+      ASSERT_EQ("", record.referenced_key);
+      ASSERT_EQ(Boolean::kFalse, record.is_referenced_key_exist_in_block);
+      ASSERT_EQ(0, record.num_keys_in_block);
+    }
+  }
+
+  Env* env_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {
+  {
+    // Generate a trace file containing a mix of blocks.
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer));
+    ASSERT_OK(writer.WriteHeader());
+    // Write blocks of different types.
+    WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock,
+                     10);
+    WriteBlockAccess(&writer, 10, TraceType::kBlockTraceDataBlock, 10);
+    WriteBlockAccess(&writer, 20, TraceType::kBlockTraceFilterBlock, 10);
+    WriteBlockAccess(&writer, 30, TraceType::kBlockTraceIndexBlock, 10);
+    WriteBlockAccess(&writer, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, header.rocksdb_major_version);
+    ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version);
+    // Read blocks.
+    VerifyAccess(&reader, 0, TraceType::kBlockTraceUncompressionDictBlock, 10);
+    VerifyAccess(&reader, 10, TraceType::kBlockTraceDataBlock, 10);
+    VerifyAccess(&reader, 20, TraceType::kBlockTraceFilterBlock, 10);
+    VerifyAccess(&reader, 30, TraceType::kBlockTraceIndexBlock, 10);
+    VerifyAccess(&reader, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    // Reading one more record should report an error.
+    BlockCacheTraceRecord record;
+    ASSERT_NOK(reader.ReadAccess(&record));
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc
index f9448069b80..b444ab371d9 100644
--- a/trace_replay/trace_replay.cc
+++ b/trace_replay/trace_replay.cc
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-
 #include "db/db_impl/db_impl.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/write_batch.h"
@@ -32,6 +31,30 @@ void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) {
 }
 }  // namespace
 
+void TracerHelper::EncodeTrace(const Trace& trace, std::string* encoded_trace) {
+  assert(encoded_trace);
+  PutFixed64(encoded_trace, trace.ts);
+  encoded_trace->push_back(trace.type);
+  PutFixed32(encoded_trace, static_cast<uint32_t>(trace.payload.size()));
+  encoded_trace->append(trace.payload);
+}
+
+Status TracerHelper::DecodeTrace(const std::string& encoded_trace,
+                                 Trace* trace) {
+  assert(trace != nullptr);
+  Slice enc_slice = Slice(encoded_trace);
+  if (!GetFixed64(&enc_slice, &trace->ts)) {
+    return Status::Incomplete("Decode trace string failed");
+  }
+  if (enc_slice.size() < kTraceTypeSize + kTracePayloadLengthSize) {
+    return Status::Incomplete("Decode trace string failed");
+  }
+  trace->type = static_cast<TraceType>(enc_slice[0]);
+  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
+  trace->payload = enc_slice.ToString();
+  return Status::OK();
+}
+
 Tracer::Tracer(Env* env, const TraceOptions& trace_options,
                std::unique_ptr<TraceWriter>&& trace_writer)
     : env_(env),
@@ -139,10 +162,7 @@ Status Tracer::WriteFooter() {
 
 Status Tracer::WriteTrace(const Trace& trace) {
   std::string encoded_trace;
-  PutFixed64(&encoded_trace, trace.ts);
-  encoded_trace.push_back(trace.type);
-  PutFixed32(&encoded_trace, static_cast<uint32_t>(trace.payload.size()));
-  encoded_trace.append(trace.payload);
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
   return trace_writer_->Write(Slice(encoded_trace));
 }
@@ -302,13 +322,7 @@ Status Replayer::ReadTrace(Trace* trace) {
   if (!s.ok()) {
     return s;
   }
-
-  Slice enc_slice = Slice(encoded_trace);
-  GetFixed64(&enc_slice, &trace->ts);
-  trace->type = static_cast<TraceType>(enc_slice[0]);
-  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
-  trace->payload = enc_slice.ToString();
-  return s;
+  return TracerHelper::DecodeTrace(encoded_trace, trace);
 }
 
 }  // namespace rocksdb
diff --git a/trace_replay/trace_replay.h b/trace_replay/trace_replay.h
index d4030c61518..d6956317096 100644
--- a/trace_replay/trace_replay.h
+++ b/trace_replay/trace_replay.h
@@ -40,6 +40,12 @@ enum TraceType : char {
   kTraceGet = 4,
   kTraceIteratorSeek = 5,
   kTraceIteratorSeekForPrev = 6,
+  // Block cache related types.
+  kBlockTraceIndexBlock = 7,
+  kBlockTraceFilterBlock = 8,
+  kBlockTraceDataBlock = 9,
+  kBlockTraceUncompressionDictBlock = 10,
+  kBlockTraceRangeDeletionBlock = 11,
   // All trace types should be added before kTraceMax
   kTraceMax,
 };
@@ -60,6 +66,15 @@ struct Trace {
   }
 };
 
+class TracerHelper {
+ public:
+  // Encode a trace object into the given string.
+  static void EncodeTrace(const Trace& trace, std::string* encoded_trace);
+
+  // Decode a string into the given trace object.
+  static Status DecodeTrace(const std::string& encoded_trace, Trace* trace);
+};
+
 // Tracer captures all RocksDB operations using a user-provided TraceWriter.
 // Every RocksDB operation is written as a single trace. Each trace will have a
 // timestamp and type, followed by the trace payload.
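A minimal end-to-end sketch of the block cache tracer API introduced above, modeled on the `MixedBlocks` test: the trace file path and the single hand-filled record are assumptions for illustration, only APIs added in this patch are used, and all `Status` checks are elided for brevity. Per `TracerHelper::EncodeTrace`, each record is serialized as a fixed64 timestamp, a one-byte trace type, a fixed32 payload size, and the payload itself.
```cpp
// Hypothetical standalone harness; not part of the patch series.
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

using namespace rocksdb;

int main() {
  Env* env = Env::Default();
  EnvOptions env_options;
  const std::string trace_path = "/tmp/block_cache_trace";  // assumed path

  {
    // Writer side. WriteHeader() records the start time and RocksDB version;
    // each WriteBlockAccess() then appends one encoded trace record.
    std::unique_ptr<TraceWriter> trace_writer;
    NewFileTraceWriter(env, env_options, trace_path, &trace_writer);
    BlockCacheTraceWriter writer(env, TraceOptions(), std::move(trace_writer));
    writer.WriteHeader();

    BlockCacheTraceRecord record;
    record.access_timestamp = env->NowMicros();
    record.block_type = TraceType::kBlockTraceDataBlock;
    record.block_size = 4096;
    record.block_key = "test-block-1";
    record.cf_id = 0;
    record.cf_name = "default";
    record.level = 1;
    record.sst_fd_number = 100;
    record.caller = BlockCacheLookupCaller::kUserGet;
    record.is_cache_hit = Boolean::kFalse;
    record.no_insert = Boolean::kFalse;
    // Only meaningful for data blocks accessed by Get/MultiGet:
    record.referenced_key = "test-get-1";
    record.is_referenced_key_exist_in_block = Boolean::kTrue;
    record.num_keys_in_block = 1024;
    writer.WriteBlockAccess(record);
  }

  {
    // Reader side: read the header back, then iterate accesses until a
    // non-OK status signals the end of the trace file.
    std::unique_ptr<TraceReader> trace_reader;
    NewFileTraceReader(env, env_options, trace_path, &trace_reader);
    BlockCacheTraceReader reader(std::move(trace_reader));
    BlockCacheTraceHeader header;
    reader.ReadHeader(&header);
    BlockCacheTraceRecord record;
    while (reader.ReadAccess(&record).ok()) {
      // Process one block cache access here.
    }
  }
  return 0;
}
```
Note that `BlockCacheTraceWriter` guards `trace_writer_` with `trace_writer_mutex_`, which suggests a single writer instance is meant to be shared by concurrent block cache lookups.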
From bee2f48a6607f641701e8971f7df3a711feaf64a Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 6 Jun 2019 11:28:54 -0700 Subject: [PATCH 113/572] Refactor the handling of cache related counters and statistics (#5408) Summary: The patch cleans up the handling of cache hit/miss/insertion related performance counters, get context counters, and statistics by eliminating some code duplication and factoring out the affected logic into separate methods. In addition, it makes the semantics of cache hit metrics more consistent by changing the code so that accessing a partition of partitioned indexes/filters through a pinned reference no longer counts as a cache hit. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5408 Differential Revision: D15610883 Pulled By: ltamasi fbshipit-source-id: ee749c18965077aca971d8f8bee8b24ed8fa76f1 --- HISTORY.md | 1 + table/block_based/block_based_table_reader.cc | 460 ++++++++++-------- table/block_based/block_based_table_reader.h | 58 ++- table/block_based/block_type.h | 24 + table/block_based/partitioned_filter_block.cc | 5 - 5 files changed, 308 insertions(+), 240 deletions(-) create mode 100644 table/block_based/block_type.h diff --git a/HISTORY.md b/HISTORY.md index 028ddcf8253..c88b436e40d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ * Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. +* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 37bbc3b52b3..0d7e3cf53a0 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -230,10 +230,10 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const Rep* const rep = table->get_rep(); assert(rep != nullptr); - constexpr bool is_index = true; const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->footer.index_handle(), - UncompressionDict::GetEmptyDict(), index_block, is_index, get_context); + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context); return s; } @@ -244,9 +244,7 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( assert(index_block != nullptr); if (!index_block_.IsEmpty()) { - *index_block = - CachableEntry(index_block_.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */); + index_block->SetUnownedValue(index_block_.GetValue()); return Status::OK(); } @@ -321,7 +319,6 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } else { ReadOptions ro; ro.fill_cache = read_options.fill_cache; - constexpr bool is_index = true; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
it = new BlockBasedTableIterator( @@ -330,7 +327,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { internal_comparator(), internal_comparator()->user_comparator(), nullptr, kNullStats, true, index_key_includes_seq(), index_value_is_full()), - false, true, /* prefix_extractor */ nullptr, is_index, + false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, index_key_includes_seq(), index_value_is_full()); } @@ -399,12 +396,11 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { for (; biter.Valid(); biter.Next()) { handle = biter.value(); CachableEntry block; - const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, is_index, nullptr /* get_context */); + &block, BlockType::kIndex, nullptr /* get_context */); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -662,44 +658,188 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { std::unique_ptr prefix_index_; }; +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) 
{ + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + Cache::Handle* BlockBasedTable::GetEntryFromCache( - Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, Statistics* statistics, + Cache* block_cache, const Slice& key, BlockType block_type, GetContext* get_context) const { - auto cache_handle = block_cache->Lookup(key, statistics); + auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, - static_cast(rep_->level)); - if (get_context != nullptr) { - // overall cache hit - get_context->get_context_stats_.num_cache_hit++; - // total bytes read from cache - get_context->get_context_stats_.num_cache_bytes_read += - block_cache->GetUsage(cache_handle); - // block-type specific cache hit - (*block_cache_hit_stats)++; - } else { - // overall cache hit - RecordTick(statistics, BLOCK_CACHE_HIT); 
- // total bytes read from cache - RecordTick(statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); - RecordTick(statistics, block_cache_hit_ticker); - } + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); } else { - PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, - static_cast(rep_->level)); - if (get_context != nullptr) { - // overall cache miss - get_context->get_context_stats_.num_cache_miss++; - // block-type specific cache miss - (*block_cache_miss_stats)++; - } else { - RecordTick(statistics, BLOCK_CACHE_MISS); - RecordTick(statistics, block_cache_miss_ticker); - } + UpdateCacheMissMetrics(block_type, get_context); } return cache_handle; @@ -1170,7 +1310,7 @@ Status BlockBasedTable::ReadRangeDelBlock( ReadOptions read_options; std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, nullptr /* input_iter */, - false /* is_index */, true /* key_includes_seq */, + BlockType::kRangeDeletion, true /* key_includes_seq */, true /* index_key_is_full */, nullptr /* get_context */, Status(), prefetch_buffer)); assert(iter != nullptr); @@ -1433,38 +1573,24 @@ Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, bool is_index, + const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = - !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; assert(block); assert(block->IsEmpty()); Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = rep_->ioptions.statistics; // Lookup uncompressed cache first if (block_cache != nullptr) { - auto cache_handle = GetEntryFromCache( - block_cache, block_cache_key, - is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, - get_context - ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss - : &get_context->get_context_stats_.num_cache_data_miss) - : nullptr, - get_context - ? (is_index ? 
&get_context->get_context_stats_.num_cache_index_hit - : &get_context->get_context_stats_.num_cache_data_hit) - : nullptr, - statistics, get_context); + auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, + block_type, get_context); if (cache_handle != nullptr) { - if (is_index) { - PERF_COUNTER_ADD(block_cache_index_hit_count, 1); - } - block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); @@ -1482,6 +1608,9 @@ Status BlockBasedTable::GetDataBlockFromCache( assert(!compressed_block_cache_key.empty()); block_cache_compressed_handle = block_cache_compressed->Lookup(compressed_block_cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + // if we found in the compressed cache, then uncompress and insert into // uncompressed cache if (block_cache_compressed_handle == nullptr) { @@ -1508,7 +1637,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { std::unique_ptr block_holder( - new Block(std::move(contents), rep_->get_global_seqno(is_index), + new Block(std::move(contents), rep_->get_global_seqno(block_type), read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && @@ -1526,32 +1655,7 @@ Status BlockBasedTable::GetDataBlockFromCache( block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } + UpdateCacheInsertionMetrics(block_type, get_context, charge); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1571,15 +1675,19 @@ Status BlockBasedTable::PutDataBlockToCache( CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - MemoryAllocator* memory_allocator, bool is_index, + MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const { const ImmutableCFOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; const size_t read_amp_bytes_per_bit = - !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0; + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; const Cache::Priority priority = - is_index && rep_->table_options - .cache_index_and_filter_blocks_with_high_priority + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) ? 
Cache::Priority::HIGH : Cache::Priority::LOW; assert(cached_block); @@ -1652,33 +1760,7 @@ Status BlockBasedTable::PutDataBlockToCache( cached_block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } - assert(reinterpret_cast(block_cache->Value( - cached_block->GetCacheHandle())) == cached_block->GetValue()); + UpdateCacheInsertionMetrics(block_type, get_context, charge); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1798,18 +1880,11 @@ CachableEntry BlockBasedTable::GetFilter( auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, filter_blk_handle, cache_key); - Statistics* statistics = rep_->ioptions.statistics; - Cache::Handle* cache_handle = GetEntryFromCache( - block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, - get_context ? &get_context->get_context_stats_.num_cache_filter_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_filter_hit - : nullptr, - statistics, get_context); + Cache::Handle* cache_handle = + GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); filter = reinterpret_cast(block_cache->Value(cache_handle)); } else if (no_io) { @@ -1827,20 +1902,9 @@ CachableEntry BlockBasedTable::GetFilter( : Cache::Priority::LOW); if (s.ok()) { PERF_COUNTER_ADD(filter_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_filter_add++; - get_context->get_context_stats_.num_cache_filter_bytes_insert += - usage; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); - RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); - } + UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); delete filter; return CachableEntry(); } @@ -1867,16 +1931,9 @@ CachableEntry BlockBasedTable::GetUncompressionDict( auto cache_key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->compression_dict_handle, cache_key_buf); - auto cache_handle = GetEntryFromCache( - rep_->table_options.block_cache.get(), cache_key, - BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, - get_context - ? 
&get_context->get_context_stats_.num_cache_compression_dict_miss - : nullptr, - get_context - ? &get_context->get_context_stats_.num_cache_compression_dict_hit - : nullptr, - rep_->ioptions.statistics, get_context); + auto cache_handle = + GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key, + BlockType::kCompressionDictionary, get_context); UncompressionDict* dict = nullptr; if (cache_handle != nullptr) { dict = reinterpret_cast( @@ -1887,43 +1944,31 @@ CachableEntry BlockBasedTable::GetUncompressionDict( std::unique_ptr compression_dict_block; Status s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); - size_t usage = 0; if (s.ok()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - dict = new UncompressionDict(compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed, - rep_->ioptions.statistics); - usage = dict->ApproximateMemoryUsage(); + std::unique_ptr uncompression_dict( + new UncompressionDict(compression_dict_block->data.ToString(), + rep_->blocks_definitely_zstd_compressed, + rep_->ioptions.statistics)); + const size_t usage = uncompression_dict->ApproximateMemoryUsage(); s = rep_->table_options.block_cache->Insert( - cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, - &cache_handle, + cache_key, uncompression_dict.get(), usage, + &DeleteCachedUncompressionDictEntry, &cache_handle, rep_->table_options.cache_index_and_filter_blocks_with_high_priority ? Cache::Priority::HIGH : Cache::Priority::LOW); - } - if (s.ok()) { - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_compression_dict_add++; - get_context->get_context_stats_ - .num_cache_compression_dict_bytes_insert += usage; + + if (s.ok()) { + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary, + get_context, usage); + dict = uncompression_dict.release(); } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD); - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); - RecordTick(rep_->ioptions.statistics, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); + RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); + assert(dict == nullptr); + assert(cache_handle == nullptr); } - } else { - // There should be no way to get here if block cache insertion succeeded. - // Though it is still possible something failed earlier. - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete dict; - dict = nullptr; - assert(cache_handle == nullptr); } } return {dict, cache_handle ? 
rep_->table_options.block_cache.get() : nullptr, @@ -1951,7 +1996,7 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - bool is_index, bool key_includes_seq, bool index_key_is_full, + BlockType block_type, bool key_includes_seq, bool index_key_is_full, GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -1972,7 +2017,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, - is_index, get_context); + block_type, get_context); if (!s.ok()) { assert(block.IsEmpty()); @@ -2037,7 +2082,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -2070,7 +2115,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - ro, block_entry, uncompression_dict, is_index, + ro, block_entry, uncompression_dict, block_type, get_context); // Can't find the block from the cache. If I/O is allowed, read from the @@ -2095,14 +2140,14 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } if (s.ok()) { - SequenceNumber seq_no = rep_->get_global_seqno(is_index); + SequenceNumber seq_no = rep_->get_global_seqno(block_type); // If filling cache is allowed and a cache is configured, try to put the // block to the cache. s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, block_entry, &raw_block_contents, raw_block_comp_type, uncompression_dict, seq_no, GetMemoryAllocator(rep_->table_options), - is_index, get_context); + block_type, get_context); } } } @@ -2113,16 +2158,19 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; - if (!is_index || rep_->table_options.cache_index_and_filter_blocks) { + if (rep_->table_options.cache_index_and_filter_blocks || + (block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + block_type != BlockType::kIndex)) { s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, block_entry, is_index, - get_context); + uncompression_dict, block_entry, + block_type, get_context); if (!s.ok()) { return s; @@ -2150,8 +2198,10 @@ Status BlockBasedTable::RetrieveBlock( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, rep_->blocks_maybe_compressed, rep_->blocks_maybe_compressed, uncompression_dict, - rep_->persistent_cache_options, rep_->get_global_seqno(is_index), - !is_index ? rep_->table_options.read_amp_bytes_per_bit : 0, + rep_->persistent_cache_options, rep_->get_global_seqno(block_type), + block_type == BlockType::kData + ? 
rep_->table_options.read_amp_bytes_per_bit + : 0, GetMemoryAllocator(rep_->table_options)); } @@ -2178,18 +2228,13 @@ InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( const BlockHandle& handle) { // Return a block iterator on the index partition - auto rep = table_->get_rep(); auto block = block_map_->find(handle.offset()); // This is a possible scenario since block cache might not have had space // for the partition if (block != block_map_->end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); - Cache* block_cache = rep->table_options.block_cache.get(); - assert(block_cache); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.GetCacheHandle())); + auto rep = table_->get_rep(); + assert(rep); + Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. @@ -2531,7 +2576,7 @@ void BlockBasedTableIterator::InitDataBlock() { Status s; table_->NewDataBlockIterator( - read_options_, data_block_handle, &block_iter_, is_index_, + read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; @@ -2623,7 +2668,6 @@ InternalIterator* BlockBasedTable::NewIterator( Arena* arena, bool skip_filters, bool for_compaction) { bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); - const bool kIsNotIndex = false; if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, @@ -2633,7 +2677,7 @@ InternalIterator* BlockBasedTable::NewIterator( rep_->index_type == BlockBasedTableOptions::kHashSearch), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, + need_upper_bound_check, prefix_extractor, BlockType::kData, true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); } else { auto* mem = @@ -2643,7 +2687,7 @@ InternalIterator* BlockBasedTable::NewIterator( NewIndexIterator(read_options, need_upper_bound_check), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, + need_upper_bound_check, prefix_extractor, BlockType::kData, true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); } } @@ -2780,7 +2824,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } else { DataBlockIter biter; NewDataBlockIterator( - read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, BlockType::kData, true /* key_includes_seq */, true /* index_key_is_full */, get_context); @@ -2893,7 +2937,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { DataBlockIter biter; NewDataBlockIterator( - read_options, iiter->value(), &biter, false, + read_options, iiter->value(), &biter, BlockType::kData, true /* key_includes_seq */, get_context); if (read_options.read_tier == kBlockCacheTier && diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index e53248fbcba..d8319a3e711 100644 --- a/table/block_based/block_based_table_reader.h +++ 
b/table/block_based/block_based_table_reader.h @@ -25,6 +25,7 @@ #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" #include "table/format.h" @@ -220,8 +221,8 @@ class BlockBasedTable : public TableReader { // input_iter: if it is not null, update this one and return it as Iterator template TBlockIter* NewDataBlockIterator( - const ReadOptions& ro, const BlockHandle& block_hanlde, - TBlockIter* input_iter = nullptr, bool is_index = false, + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter = nullptr, BlockType block_type = BlockType::kData, bool key_includes_seq = true, bool index_key_is_full = true, GetContext* get_context = nullptr, Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr) const; @@ -238,12 +239,14 @@ class BlockBasedTable : public TableReader { friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage) const; Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, - uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, - Statistics* statistics, + BlockType block_type, GetContext* get_context) const; // If block cache enabled (compressed or uncompressed), looks for the block @@ -258,7 +261,7 @@ class BlockBasedTable : public TableReader { Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index = false, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context = nullptr) const; // Similar to the above, with one crucial difference: it will retrieve the @@ -267,7 +270,7 @@ class BlockBasedTable : public TableReader { Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context) const; // For the following two functions: @@ -311,7 +314,7 @@ class BlockBasedTable : public TableReader { const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, bool is_index = false, + const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context = nullptr) const; // Put a raw block (maybe compressed) to the corresponding block caches. @@ -324,16 +327,14 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
- Status PutDataBlockToCache(const Slice& block_cache_key, - const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, - BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, - const UncompressionDict& uncompression_dict, - SequenceNumber seq_no, - MemoryAllocator* memory_allocator, bool is_index, - GetContext* get_context) const; + Status PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -546,8 +547,12 @@ struct BlockBasedTable::Rep { bool closed = false; const bool immortal_table; - SequenceNumber get_global_seqno(bool is_index) const { - return is_index ? kDisableGlobalSequenceNumber : global_seqno; + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilter || + block_type == BlockType::kIndex || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; } }; @@ -560,8 +565,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { const InternalKeyComparator& icomp, InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, - const SliceTransform* prefix_extractor, bool is_index, - bool key_includes_seq = true, + const SliceTransform* prefix_extractor, + BlockType block_type, bool key_includes_seq = true, bool index_key_is_full = true, bool for_compaction = false) : InternalIteratorBase(false), @@ -575,7 +580,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), - is_index_(is_index), + block_type_(block_type), key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), for_compaction_(for_compaction) {} @@ -690,8 +695,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; - // If the blocks over which we iterate are index blocks - bool is_index_; + BlockType block_type_; // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; bool index_key_is_full_; diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h new file mode 100644 index 00000000000..9b9c53946c9 --- /dev/null +++ b/table/block_based/block_type.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. 
+
+enum class BlockType : uint8_t {
+  kData,
+  kFilter,
+  kProperties,
+  kCompressionDictionary,
+  kRangeDeletion,
+  kMetaIndex,
+  kIndex,
+};
+
+}  // namespace rocksdb
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index 315e63306f1..7874ce1874f 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -243,11 +243,6 @@ PartitionedFilterBlockReader::GetFilterPartition(
   // This is a possible scenario since block cache might not have had space
   // for the partition
   if (iter != filter_map_.end()) {
-    PERF_COUNTER_ADD(block_cache_hit_count, 1);
-    RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT);
-    RecordTick(statistics(), BLOCK_CACHE_HIT);
-    RecordTick(statistics(), BLOCK_CACHE_BYTES_READ,
-               block_cache->GetUsage(iter->second.GetCacheHandle()));
     return {iter->second.GetValue(), nullptr /* cache */,
             nullptr /* cache_handle */, false /* own_value */};
   }

From d68f9f4580f083023f8e20939b2866cac48f9bb6 Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Thu, 6 Jun 2019 13:52:39 -0700
Subject: [PATCH 114/572] simplify include directive involving inttypes (#5402)

Summary:
When using a `PRIu64`-type printf specifier, the current code base does the following:
```
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
```
However, this can be simplified to
```
#include <cinttypes>
```
as long as the flag `-std=c++11` is used.
This should solve issues like https://github.com/facebook/rocksdb/issues/5159
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5402

Differential Revision: D15701195

Pulled By: miasantreble

fbshipit-source-id: 6dac0a05f52aadb55e9728038599d3d2e4b59d03
---
 cache/cache_bench.cc | 5 +----
 cache/lru_cache.cc | 4 ----
 cache/sharded_cache.cc | 4 ----
 db/column_family.cc | 6 +-----
 db/compaction/compaction.cc | 6 +-----
 db/compaction/compaction_job.cc | 7 +------
 db/compaction/compaction_job_stats_test.cc | 6 +-----
 db/compaction/compaction_job_test.cc | 6 +-----
 db/compaction/compaction_picker.cc | 6 +-----
 db/compaction/compaction_picker_fifo.cc | 6 +-----
 db/compaction/compaction_picker_level.cc | 4 ----
 db/compaction/compaction_picker_universal.cc | 6 +-----
 db/corruption_test.cc | 2 +-
 db/db_filesnapshot.cc | 6 +-----
 db/db_impl/db_impl.cc | 3 ---
 db/db_impl/db_impl_compaction_flush.cc | 5 +----
 db/db_impl/db_impl_experimental.cc | 6 +-----
 db/db_impl/db_impl_files.cc | 5 +----
 db/db_impl/db_impl_open.cc | 5 +----
 db/db_impl/db_impl_secondary.cc | 5 +----
 db/db_impl/db_impl_write.cc | 5 +----
 db/db_info_dumper.cc | 6 +-----
 db/db_test_util.h | 6 +-----
 db/dbformat.cc | 6 +-----
 db/external_sst_file_ingestion_job.cc | 6 +-----
 db/flush_job.cc | 6 +-----
 db/forward_iterator_bench.cc | 4 ----
 db/internal_stats.cc | 6 +-----
 db/memtable_list.cc | 6 +-----
 db/range_tombstone_fragmenter.cc | 2 +-
 db/repair.cc | 6 +-----
 db/transaction_log_impl.cc | 5 +----
 db/version_builder.cc | 6 +-----
 db/version_set.cc | 6 +-----
 db/wal_manager.cc | 6 +-----
 examples/multi_processes_example.cc | 2 +-
 file/delete_scheduler_test.cc | 6 +-----
 file/filename.cc | 6 +-----
 file/sst_file_manager_impl.cc | 6 +-----
 include/rocksdb/utilities/backupable_db.h | 6 +-----
 logging/event_logger.cc | 6 +-----
 memtable/memtablerep_bench.cc | 4 ----
 monitoring/histogram.cc | 6 +-----
 monitoring/statistics.cc | 6 +-----
 options/cf_options.cc | 6 +-----
 options/db_options.cc | 6 +-----
 options/options.cc | 6 +-----
 options/options_settable_test.cc | 4 ----
 options/options_test.cc | 6
+----- table/block_based/block_based_table_factory.cc | 7 +------ table/block_based/index_builder.cc | 2 +- table/block_based/index_builder.h | 2 +- table/block_fetcher.cc | 2 +- table/cuckoo/cuckoo_table_reader_test.cc | 6 +----- table/format.cc | 2 +- table/plain/plain_table_index.cc | 6 +----- table/sst_file_reader_test.cc | 2 +- test_util/transaction_test_util.cc | 6 +----- tools/db_bench.cc | 4 ---- tools/db_bench_tool.cc | 6 +----- tools/db_stress.cc | 6 +----- tools/dump/db_dump_tool.cc | 6 +----- tools/ldb_cmd.cc | 6 +----- tools/sst_dump_tool.cc | 6 +----- tools/trace_analyzer_tool.cc | 4 ---- tools/write_stress.cc | 6 +----- util/crc32c_arm64.h | 2 +- util/crc32c_ppc.c | 2 +- util/duplicate_detector.h | 6 +----- util/dynamic_bloom_test.cc | 6 +----- util/rate_limiter_test.cc | 6 +----- util/string_util.cc | 6 +----- utilities/backupable/backupable_db.cc | 6 +----- utilities/blob_db/blob_db.cc | 6 +----- utilities/blob_db/blob_dump_tool.cc | 6 +----- utilities/blob_db/blob_file.cc | 6 +----- utilities/checkpoint/checkpoint_impl.cc | 6 +----- utilities/options/options_util_test.cc | 5 +---- utilities/persistent_cache/persistent_cache_tier.cc | 7 +------ utilities/transactions/pessimistic_transaction_db.cc | 6 +----- utilities/transactions/transaction_base.cc | 6 +----- utilities/transactions/transaction_lock_mgr.cc | 6 +----- utilities/transactions/transaction_test.cc | 4 ---- utilities/transactions/transaction_test.h | 6 +----- utilities/transactions/transaction_util.cc | 6 +----- utilities/transactions/write_prepared_transaction_test.cc | 6 +----- utilities/transactions/write_prepared_txn.cc | 6 +----- utilities/transactions/write_prepared_txn_db.cc | 6 +----- utilities/transactions/write_prepared_txn_db.h | 6 +----- .../transactions/write_unprepared_transaction_test.cc | 4 ---- utilities/transactions/write_unprepared_txn.cc | 4 ---- utilities/transactions/write_unprepared_txn_db.cc | 4 ---- utilities/transactions/write_unprepared_txn_db.h | 4 ---- 93 files changed, 79 insertions(+), 405 deletions(-) diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc index 098813d9d74..35deb200596 100644 --- a/cache/cache_bench.cc +++ b/cache/cache_bench.cc @@ -3,9 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #ifndef GFLAGS #include int main() { @@ -14,7 +11,7 @@ int main() { } #else -#include +#include #include #include diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index fdcbb4e86cb..676bed3051c 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/lru_cache.h" #include diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index a48a32185bf..8fc0a7a17a3 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/sharded_cache.h" #include diff --git a/db/column_family.cc b/db/column_family.cc index 531cbeca681..2a2e6cb980f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,11 +9,7 @@ #include "db/column_family.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 5dc7e83c8fc..6d7a3561660 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -7,11 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 5761345d8a2..ca8575a0dc9 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,12 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 5fb805df5f0..221ee3eaad3 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -7,11 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 838cda5eaca..66c3353fcf6 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -5,11 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index a03f7b46fd1..3357e06319d 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -9,11 +9,7 @@ #include "db/compaction/compaction_picker.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 1fc6ed113d2..4ff301d21c3 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -10,11 +10,7 @@ #include "db/compaction/compaction_picker_fifo.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include "db/column_family.h" diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index e9653da8e55..cc0f19b8171 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include #include diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index e8aca00be81..5909ab576c3 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -10,11 +10,7 @@ #include "db/compaction/compaction_picker_universal.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 9e83c9080e6..82752161f39 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include "db/db_impl/db_impl.h" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ac544793ee4..3ff7c73f4e8 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -6,11 +6,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 96b911a6d37..bb6ec7db4c5 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -8,9 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include #ifdef OS_SOLARIS #include diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 7be9b62c5d6..bd1a8e74f48 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index a8fed40be01..f0e6fafccba 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -9,11 +9,7 @@ #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 608c8ce4948..c018432c9b8 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include #include #include "db/event_helpers.h" diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 2fc12746d7d..69c9c4117d7 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 34364d124a8..827d99929a9 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -5,10 +5,7 @@ #include "db/db_impl/db_impl_secondary.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/db_iter.h" #include "db/merge_context.h" diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 947194ace19..7ff2982d147 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -8,10 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/error_handler.h" #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index be85357c2e1..e2bb01e0e97 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -3,13 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "db/db_info_dumper.h" -#include +#include #include #include #include diff --git a/db/db_test_util.h b/db/db_test_util.h index 4e9fcafadfa..6e1d0ed7a13 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -9,12 +9,8 @@ #pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include -#include +#include #include #include diff --git a/db/dbformat.cc b/db/dbformat.cc index cd2878198c4..bfaea868b53 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -8,11 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "monitoring/perf_context_imp.h" #include "port/port.h" diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index aec398552c7..0068685b0ba 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -7,11 +7,7 @@ #include "db/external_sst_file_ingestion_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/flush_job.cc b/db/flush_job.cc index 2b2696c10ba..589d81f2974 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -9,11 +9,7 @@ #include "db/flush_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 17b0ca16544..174a258a682 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -3,10 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #if !defined(GFLAGS) || defined(ROCKSDB_LITE) #include int main() { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 21dde297ab6..50f6ed2e688 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -10,11 +10,7 @@ #include "db/internal_stats.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 045bfc9a2d3..0f796eb9a73 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -5,11 +5,7 @@ // #include "db/memtable_list.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index e3eb18908a5..3d3a5c4520f 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include "util/autovector.h" diff --git a/db/repair.cc b/db/repair.cc index 400e754ba45..6967a46e36c 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -60,11 +60,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/dbformat.h" diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index f92d563eb8e..2e4475bb6ac 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -4,12 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include "db/transaction_log_impl.h" -#include +#include #include "db/write_batch_internal.h" #include "util/file_reader_writer.h" diff --git a/db/version_builder.cc b/db/version_builder.cc index 84e4dc6579a..9d2ba9ab4ee 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -9,11 +9,7 @@ #include "db/version_builder.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/version_set.cc b/db/version_set.cc index ed9a316ac72..96bf22e57b4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,11 +9,7 @@ #include "db/version_set.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 71c2ffe4b22..58671d599c5 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -9,11 +9,7 @@ #include "db/wal_manager.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc index b1c1d02ba25..7350e1be253 100644 --- a/examples/multi_processes_example.cc +++ b/examples/multi_processes_example.cc @@ -14,7 +14,7 @@ // run for a while, tailing the logs of the primary. After process with primary // instance exits, this process will keep running until you hit 'CTRL+C'. -#include +#include #include #include #include diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 510753b3b45..3549a9f84eb 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -3,11 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/file/filename.cc b/file/filename.cc index c9f22e585b7..d4f7dd9ec7c 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -6,12 +6,8 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "file/filename.h" -#include +#include #include #include diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index efd9e30e6a5..08ea873258a 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -5,11 +5,7 @@ #include "file/sst_file_manager_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/db_impl/db_impl.h" diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 7817c564965..1ca4fc9a670 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,11 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/logging/event_logger.cc b/logging/event_logger.cc index aceccdf93c0..182e282b2f0 100644 --- a/logging/event_logger.cc +++ b/logging/event_logger.cc @@ -3,13 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "logging/event_logger.h" -#include +#include #include #include #include diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 003d59b2a86..1e2b5bdd1e5 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifndef GFLAGS #include int main() { diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 4bc7139d304..29bf78ad7c9 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -7,13 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "monitoring/histogram.h" -#include +#include #include #include #include diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index fe2f2e25af3..15d702d1f4a 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -5,11 +5,7 @@ // #include "monitoring/statistics.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "rocksdb/statistics.h" #include "port/likely.h" #include diff --git a/options/cf_options.cc b/options/cf_options.cc index f7af3f834c9..5830fc6613d 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -5,11 +5,7 @@ #include "options/cf_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/options/db_options.cc b/options/db_options.cc index 72e348b3227..bdcdd250a0a 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -5,11 +5,7 @@ #include "options/db_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "logging/logging.h" #include "port/port.h" diff --git a/options/options.cc b/options/options.cc index 8977b58905f..1d2b6193cbc 100644 --- a/options/options.cc +++ b/options/options.cc @@ -9,11 +9,7 @@ #include "rocksdb/options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "monitoring/statistics.h" diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2e21a2688f8..6044cc4b1c4 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include "options/options_helper.h" diff --git a/options/options_test.cc b/options/options_test.cc index 1aa3bace7dd..9fcd241d70f 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -7,14 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include #include -#include +#include #include "cache/lru_cache.h" #include "cache/sharded_cache.h" diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 121cc916e25..cf205be72de 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -7,12 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index 738b9e3e099..c1ce541ae56 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -10,7 +10,7 @@ #include "table/block_based/index_builder.h" #include -#include +#include #include #include diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index 7e6a4bb0776..6baa9891b1d 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -10,7 +10,7 @@ #pragma once #include -#include +#include #include #include diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 72b567fc23d..afcbbaee4f5 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -9,7 +9,7 @@ #include "table/block_fetcher.h" -#include +#include #include #include "logging/logging.h" diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index 681e0dfdf3e..dd65ffe8490 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -13,11 +13,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/table/format.cc b/table/format.cc index a4441fe5646..2046903a703 100644 --- a/table/format.cc +++ b/table/format.cc @@ -9,7 +9,7 @@ #include "table/format.h" -#include +#include #include #include "block_fetcher.h" diff --git a/table/plain/plain_table_index.cc b/table/plain/plain_table_index.cc index 196be22cfe9..b4207f348cb 100644 --- a/table/plain/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -5,11 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "table/plain/plain_table_index.h" #include "util/coding.h" diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 529634ccd75..dd7a5101677 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include +#include #include "rocksdb/db.h" #include "rocksdb/sst_file_reader.h" diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index 3a7d9e97f50..b71ad0a1f56 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -4,13 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "test_util/transaction_test_util.h" -#include +#include #include #include #include diff --git a/tools/db_bench.cc b/tools/db_bench.cc index 634bbba30ac..1ad77295fa6 100644 --- a/tools/db_bench.cc +++ b/tools/db_bench.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifndef GFLAGS #include int main() { diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index c6f19bed585..b254978c5ed 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifdef GFLAGS #ifdef NUMA #include @@ -20,7 +16,7 @@ #include #endif #include -#include +#include #include #include #include diff --git a/tools/db_stress.cc b/tools/db_stress.cc index dc8f8152376..5fd84258b1f 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -28,12 +28,8 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif // __STDC_FORMAT_MACROS - #include -#include +#include #include #include #include diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc index 8c5fa82e5b9..06a47ce725b 100644 --- a/tools/dump/db_dump_tool.cc +++ b/tools/dump/db_dump_tool.cc @@ -5,11 +5,7 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "rocksdb/db.h" diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index d6f9b415707..958d862fd32 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -7,11 +7,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index aa051da01f5..ed5600194ad 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -8,11 +8,7 @@ #include "tools/sst_dump_tool_imp.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 6ab606f6a6a..627610ae0f4 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -6,10 +6,6 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifdef GFLAGS #ifdef NUMA #include diff --git a/tools/write_stress.cc b/tools/write_stress.cc index 8cde31e6b84..95948ef5730 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -56,11 +56,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif // __STDC_FORMAT_MACROS - -#include +#include #include #include #include diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h index 0e77ecd0ef5..80b3aca361a 100644 --- a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -6,7 +6,7 @@ #ifndef UTIL_CRC32C_ARM64_H #define UTIL_CRC32C_ARM64_H -#include +#include #if defined(__aarch64__) || defined(__AARCH64__) #ifdef __ARM_FEATURE_CRC32 diff --git a/util/crc32c_ppc.c b/util/crc32c_ppc.c index 654d606aaad..ce0b9f27ce6 100644 --- a/util/crc32c_ppc.c +++ b/util/crc32c_ppc.c @@ -6,7 +6,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#define CRC_TABLE -#include +#include #include #include #include "util/crc32c_ppc_constants.h" diff --git a/util/duplicate_detector.h b/util/duplicate_detector.h index 40a1cbd129b..1fab009751b 100644 --- a/util/duplicate_detector.h +++ b/util/duplicate_detector.h @@ -5,11 +5,7 @@ #pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "util/set_comparator.h" diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 7ca8bb891aa..3f98ccd0189 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,11 +11,7 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index d9f17cc3ac6..7795e01fc9d 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,13 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "util/rate_limiter.h" -#include +#include #include #include diff --git a/util/string_util.cc b/util/string_util.cc index 26e6759ac2a..74f6afbf0f4 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -5,12 +5,8 @@ // #include "util/string_util.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include -#include +#include #include #include #include diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 7a2e1940316..b7592a0ce2b 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -23,11 +23,7 @@ #include "util/string_util.h" #include "utilities/checkpoint/checkpoint_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif // __STDC_FORMAT_MACROS - -#include +#include #include #include #include diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index d660def4908..bee36a667a2 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -5,13 +5,9 @@ // #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/blob_db/blob_db.h" -#include +#include #include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index 37eee19dbe1..b74a211bc95 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -4,12 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/blob_db/blob_dump_tool.h" -#include +#include #include #include #include diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 03cff7834b9..3f128c7d55e 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -6,11 +6,7 @@ #ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_file.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 7468c8eedee..4835f26da6e 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -11,11 +11,7 @@ #include "utilities/checkpoint/checkpoint_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 8c71dbf5dc3..3926275af5e 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -4,11 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include #include diff --git a/utilities/persistent_cache/persistent_cache_tier.cc b/utilities/persistent_cache/persistent_cache_tier.cc index 732762a1652..752a6fb70b6 100644 --- a/utilities/persistent_cache/persistent_cache_tier.cc +++ b/utilities/persistent_cache/persistent_cache_tier.cc @@ -5,14 +5,9 @@ // #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/persistent_cache/persistent_cache_tier.h" -#include "inttypes.h" - +#include #include #include diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index e906b444ff5..2f9c918a3b4 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/pessimistic_transaction_db.h" -#include +#include #include #include #include diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 6553b49614c..5621a7fa372 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -7,11 +7,7 @@ #include "utilities/transactions/transaction_base.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/column_family.h" #include "db/db_impl/db_impl.h" diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 757b77fde4e..084d817ea08 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_lock_mgr.h" -#include +#include #include #include diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 6c9f4bccd62..35a9706830e 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -5,10 
+5,6 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_test.h" #include diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index da2a08d3c52..9b634c11ca7 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -5,11 +5,7 @@ #pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc index c582b73aa3e..407feaaa88a 100644 --- a/utilities/transactions/transaction_util.cc +++ b/utilities/transactions/transaction_util.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_util.h" -#include +#include #include #include diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 8b52b1ae662..7c588f4ef69 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_test.h" -#include +#include #include #include #include diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 05650e2b3f9..f55615063e5 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -7,11 +7,7 @@ #include "utilities/transactions/write_prepared_txn.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index e2a8fbbf20f..8e08d074134 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/write_prepared_txn_db.h" -#include +#include #include #include #include diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index ffdf2f29d8f..876279cba23 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -6,11 +6,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index 914f3f581e4..faa6c774578 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -5,10 +5,6 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "utilities/transactions/transaction_test.h" #include "utilities/transactions/write_unprepared_txn.h" #include "utilities/transactions/write_unprepared_txn_db.h" diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index efd766514c8..73e9a8837a0 100644 --- 
a/utilities/transactions/write_unprepared_txn.cc
+++ b/utilities/transactions/write_unprepared_txn.cc
@@ -10,10 +10,6 @@
 #include "util/cast_util.h"
 #include "utilities/transactions/write_unprepared_txn_db.h"
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-
 namespace rocksdb {
 bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) {
diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc
index a1aeedf2e15..ea655f88e3c 100644
--- a/utilities/transactions/write_unprepared_txn_db.cc
+++ b/utilities/transactions/write_unprepared_txn_db.cc
@@ -5,10 +5,6 @@
 #ifndef ROCKSDB_LITE
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-
 #include "utilities/transactions/write_unprepared_txn_db.h"
 #include "rocksdb/utilities/transaction_db.h"
 #include "util/cast_util.h"
diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h
index 4b4e31e1b60..fab8ce8263d 100644
--- a/utilities/transactions/write_unprepared_txn_db.h
+++ b/utilities/transactions/write_unprepared_txn_db.h
@@ -6,10 +6,6 @@
 #pragma once
 #ifndef ROCKSDB_LITE
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS
-#endif
-
 #include "utilities/transactions/write_prepared_txn_db.h"
 #include "utilities/transactions/write_unprepared_txn.h"

From fd94353ea36aa6680ef99faab644e23a33599720 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 6 Jun 2019 16:14:51 -0700
Subject: [PATCH 115/572] Remove the artifacts field from
 stress_crash/stress_crash_with_atomic_flush

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5422
Differential Revision: D15706212
Pulled By: ltamasi
fbshipit-source-id: 0acf060fb8568efee51c033e50b492bcf1095a4c
---
 build_tools/rocksdb-lego-determinator | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator
index 2447a19ae44..31bcbad38cb 100755
--- a/build_tools/rocksdb-lego-determinator
+++ b/build_tools/rocksdb-lego-determinator
@@ -109,13 +109,6 @@ else
   TASK_CREATION_TOOL="false"
 fi
-ARTIFACTS=" 'artifacts': [
-  {
-    'name':'database',
-    'paths':[ '/dev/shm/rocksdb' ],
-  }
-]"
-
 #
 # A mechanism to disable tests temporarily
 #
@@ -395,7 +388,6 @@ STRESS_CRASH_TEST_COMMANDS="[
         $PARSER
       }
     ],
-    $ARTIFACTS,
     $REPORT
   }
 ]"
@@ -424,7 +416,6 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[
         $PARSER
       }
     ],
-    $ARTIFACTS,
     $REPORT
   }
 ]"

From ad52626cf4fd53b1549c4d04ea4c4dae9e4441d9 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 6 Jun 2019 17:30:57 -0700
Subject: [PATCH 116/572] Remove special characters from job names (#5424)

Summary:
Special characters like slashes and parentheses are not supported.
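The renames in this patch are applied by hand, but the constraint can be made
concrete with a short sketch. This is hypothetical code, not part of the
change, and the exact character set the job runner accepts is an assumption
here:

  #include <cctype>
  #include <string>

  // Hypothetical sanitizer: keep letters, digits, spaces and underscores,
  // and map everything else (slashes, parentheses, ...) to a space.
  std::string SanitizeJobName(const std::string& name) {
    std::string out;
    out.reserve(name.size());
    for (char c : name) {
      if (std::isalnum(static_cast<unsigned char>(c)) || c == ' ' || c == '_') {
        out.push_back(c);
      } else {
        out.push_back(' ');
      }
    }
    return out;
  }

  // SanitizeJobName("Rocksdb Stress/Crash Test (atomic flush)")
  //   == "Rocksdb Stress Crash Test  atomic flush "

The patch below instead picks readable replacement names such as
'Rocksdb Stress and Crash Test'.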
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5424 Differential Revision: D15708067 Pulled By: ltamasi fbshipit-source-id: 90527ec3ee882a0cdd1249c3946f5eff2ff7c115 --- build_tools/rocksdb-lego-determinator | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index 31bcbad38cb..e47b2ef30d8 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -369,7 +369,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ # STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test', + 'name':'Rocksdb Stress and Crash Test', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ @@ -397,7 +397,7 @@ STRESS_CRASH_TEST_COMMANDS="[ # STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test (atomic flush)', + 'name':'Rocksdb Stress and Crash Test with atomic flush', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ @@ -489,7 +489,7 @@ ASAN_CRASH_TEST_COMMANDS="[ # ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under ASAN', + 'name':'Rocksdb crash test with atomic flush under ASAN', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ @@ -553,7 +553,7 @@ UBSAN_CRASH_TEST_COMMANDS="[ # UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under UBSAN', + 'name':'Rocksdb crash test with atomic flush under UBSAN', 'oncall':'$ONCALL', 'timeout': 86400, 'steps': [ From 0f48e56f96c9ef360a09cb3a76830c165c9ae392 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 7 Jun 2019 15:13:43 -0700 Subject: [PATCH 117/572] Revert to checking the upper bound on a per-key basis in BlockBasedTableIterator (#5428) Summary: PR #5111 reduced the number of key comparisons when iterating with upper/lower bounds; however, this caused a regression for MyRocks. Reverting to the previous behavior in BlockBasedTableIterator as a hotfix. 
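As a sketch of the two strategies (simplified and hypothetical; this is not
the actual BlockBasedTableIterator code), the per-key check compares every
key against the bound, while the optimized variant consults a cached
per-block flag that can go stale:

  #include <string>

  // 'upper_bound' is exclusive, and Compare() stands in for
  // user_comparator_.Compare() in the real iterator.
  struct IteratorSketch {
    std::string key;
    const std::string* upper_bound = nullptr;
    bool valid = false;
    // Cached block-level flag used by the PR #5111 optimization: true only
    // when the whole current data block is known to be below the bound.
    bool block_within_upper_bound = false;

    static int Compare(const std::string& a, const std::string& b) {
      return a.compare(b);
    }

    // Per-key check (the behavior restored by this patch): validity can
    // never go stale because every key is compared against the bound.
    void CheckBoundPerKey() {
      if (valid && upper_bound != nullptr && Compare(key, *upper_bound) >= 0) {
        valid = false;
      }
    }

    // Block-level check: skips the comparison while the cached flag says
    // the block is within bounds. Cheaper, but if the flag falls out of
    // sync with the iterator position, out-of-bound keys leak through;
    // that is the kind of regression this hotfix works around.
    void CheckBoundPerBlock() {
      if (valid && !block_within_upper_bound && upper_bound != nullptr &&
          Compare(key, *upper_bound) >= 0) {
        valid = false;
      }
    }
  };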
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5428 Differential Revision: D15721038 Pulled By: ltamasi fbshipit-source-id: 5450106442f1763bccd17f6cfd648697f2ae8b6c --- db/db_iter.cc | 5 +++++ table/block_based/block_based_table_reader.cc | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/db/db_iter.cc b/db/db_iter.cc index 29a1a9eac1a..633724c5763 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,6 +467,8 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); + assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; @@ -859,6 +861,9 @@ void DBIter::PrevInternal() { return; } + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) >= 0); if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 0d7e3cf53a0..68213f04149 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2597,9 +2597,15 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = + // TODO: we should be able to use !data_block_within_upper_bound_ here + // instead of performing the comparison; however, the flag can apparently + // be out of sync with the comparison in some cases. This should be + // investigated. + const bool next_block_is_out_of_bound = read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + block_iter_points_to_real_block_ && + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { From b703a56e5cd722aaf169baa3e28127426776b6a9 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 7 Jun 2019 15:31:40 -0700 Subject: [PATCH 118/572] Potential fix for stress test failure due to "SST file ahead of WAL" error (#5412) Summary: I'm not able to prove it, but the stress test failure may be caused by the following sequence of events - 1. Crash db_stress while writing the log file. This should result in a corrupted WAL. 2. Run db_stress with recycle_log_file_num=1. Crash during recovery immediately after writing manifest and updating the current file. The old log from the previous run is left behind, but the memtable would have been flushed during recovery and the CF log number will point to the newer log 3. Run db_stress with recycle_log_file_num=0. During recovery, the old log file will be processed and the corruption will be detected. Since the CF has moved ahead, we get the "SST file is ahead of WAL" error Test - 1. stress_crash 2. 
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5412
Differential Revision: D15699120
Pulled By: anand1976
fbshipit-source-id: 9092ce81e7c4a0b4b4e66560c23ea4812a4d9cbe
---
 db/db_impl/db_impl_compaction_flush.cc | 7 +++++
 db/db_impl/db_impl_open.cc | 5 +++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index bd1a8e74f48..8cb37484cac 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -107,6 +107,13 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) {
       if (!s.ok()) {
         break;
       }
+
+      if (immutable_db_options_.recycle_log_file_num > 0) {
+        s = log->Close();
+        if (!s.ok()) {
+          break;
+        }
+      }
     }
     if (s.ok()) {
       s = directories_.GetWalDir()->Fsync();
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 69c9c4117d7..baa4fe707aa 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -555,12 +555,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
   bool stop_replay_for_corruption = false;
   bool flushed = false;
   uint64_t corrupted_log_number = kMaxSequenceNumber;
+  uint64_t min_log_number = MinLogNumberToKeep();
   for (auto log_number : log_numbers) {
-    if (log_number < versions_->min_log_number_to_keep_2pc()) {
+    if (log_number < min_log_number) {
       ROCKS_LOG_INFO(immutable_db_options_.info_log,
                      "Skipping log #%" PRIu64
                      " since it is older than min log to keep #%" PRIu64,
-                     log_number, versions_->min_log_number_to_keep_2pc());
+                     log_number, min_log_number);
       continue;
     }
     // The previous incarnation may not have written any MANIFEST

From a16d0cc494ea8853b84c606efc04b61e33878fff Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Fri, 7 Jun 2019 19:34:48 -0700
Subject: [PATCH 119/572] Fix build errors regarding const qualifier being
 ignored on cast result type (#5432)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This affects some TSAN builds:

env/env_test.cc: In member function ‘virtual void rocksdb::EnvPosixTestWithParam_MultiRead_Test::TestBody()’:
env/env_test.cc:1126:76: error: type qualifiers ignored on cast result type [-Werror=ignored-qualifiers]
     auto data = NewAligned(kSectorSize * 8, static_cast<const char>(i + 1));
                                                                            ^
env/env_test.cc:1154:77: error: type qualifiers ignored on cast result type [-Werror=ignored-qualifiers]
      auto buf = NewAligned(kSectorSize * 8, static_cast<const char>(i*2 + 1));
                                                                             ^
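A minimal reproduction of the warning, separate from the RocksDB tree, shows
why the const qualifier has to go:

  // Compile with: g++ -Wignored-qualifiers -Werror -c cast_example.cc
  char TakeChar(char c) { return c; }

  void Example(int i) {
    // warning: type qualifiers ignored on cast result type
    // The cast yields a prvalue, so a const result type is meaningless,
    // and -Werror builds such as the TSAN job turn the warning fatal.
    TakeChar(static_cast<const char>(i + 1));

    // The fix applied below: cast to the unqualified type.
    TakeChar(static_cast<char>(i + 1));
  }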
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5432
Differential Revision: D15727277
Pulled By: ltamasi
fbshipit-source-id: dc0e687b123e7c4d703ccc0c16b7167e07d1c9b0
---
 env/env_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/env/env_test.cc b/env/env_test.cc
index a2b6db5c475..6f225e37f67 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -1123,7 +1123,7 @@ TEST_P(EnvPosixTestWithParam, MultiRead) {
 #endif
   ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
   for (size_t i = 0; i < kNumSectors; ++i) {
-    auto data = NewAligned(kSectorSize * 8, static_cast<const char>(i + 1));
+    auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
     Slice slice(data.get(), kSectorSize);
     ASSERT_OK(wfile->Append(slice));
   }
@@ -1151,7 +1151,7 @@
   ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
   ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));
   for (size_t i = 0; i < reqs.size(); ++i) {
-    auto buf = NewAligned(kSectorSize * 8, static_cast<const char>(i*2 + 1));
+    auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i*2 + 1));
     ASSERT_OK(reqs[i].status);
     ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0);
   }

From c292dc85402e0da7b816076ceb4b404e427d5ab4 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 10 Jun 2019 11:47:16 -0700
Subject: [PATCH 120/572] WritePrepared: reduce prepared_mutex_ overhead (#5420)

Summary:
The patch reduces contention on prepared_mutex_ using these techniques:
1) Move ::RemovePrepared() to be called from the commit callback when we have
two write queues.
2) Use two separate mutexes for PreparedHeap: prepared_mutex_, which is still
needed for ::RemovePrepared, and ::push_pop_mutex(), which is needed for
::AddPrepared(). Given that ::AddPrepared is called only from the first write
queue and ::RemovePrepared mostly from the second, the two write queues no
longer compete with each other over a single mutex. ::RemovePrepared might
occasionally need to acquire ::push_pop_mutex() if ::erase() ends up calling
::pop().
3) Acquire ::push_pop_mutex() on the first callback of the write queue and
release it on the last.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5420
Differential Revision: D15741985
Pulled By: maysamyabandeh
fbshipit-source-id: 84ce8016007e88bb6e10da5760ba1f0d26347735
---
 db/db_impl/db_impl_write.cc | 19 ++-
 db/pre_release_callback.h | 6 +-
 db/write_callback_test.cc | 3 +-
 .../transactions/pessimistic_transaction.cc | 3 +-
 .../write_prepared_transaction_test.cc | 61 ++++++----
 utilities/transactions/write_prepared_txn.cc | 26 ++--
 .../transactions/write_prepared_txn_db.cc | 67 +++++++----
 .../transactions/write_prepared_txn_db.h | 113 +++++++++++++-----
 .../transactions/write_unprepared_txn.cc | 4 +-
 .../transactions/write_unprepared_txn_db.cc | 4 +-
 .../transactions/write_unprepared_txn_db.h | 6 +-
 11 files changed, 218 insertions(+), 94 deletions(-)

diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7ff2982d147..21b123c3a94 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -263,6 +263,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     size_t total_count = 0;
     size_t valid_batches = 0;
     size_t total_byte_size = 0;
+    size_t pre_release_callback_cnt = 0;
     for (auto* writer : write_group) {
       if (writer->CheckCallback(this)) {
         valid_batches += writer->batch_cnt;
@@ -270,9 +271,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
           total_count += WriteBatchInternal::Count(writer->batch);
           parallel = parallel && !writer->batch->HasMerge();
         }
-
         total_byte_size = WriteBatchInternal::AppendedByteSize(
             total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        if (writer->pre_release_callback) {
+          pre_release_callback_cnt++;
+        }
       }
     }
     // Note about seq_per_batch_: either disableWAL is set for the entire write
@@ -336,6 +339,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // PreReleaseCallback is called after WAL write and before memtable write
     if (status.ok()) {
       SequenceNumber next_sequence = current_sequence;
+      size_t index = 0;
      // Note: the logic for advancing seq here must be consistent with the
      // logic in WriteBatchInternal::InsertInto(write_group...) as well as
      // with WriteBatchInternal::InsertInto(write_batch...)
that is called on @@ -347,7 +351,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, writer->sequence = next_sequence; if (writer->pre_release_callback) { Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -675,11 +680,15 @@ Status DBImpl::WriteImplWALOnly( // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only + size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { total_byte_size = WriteBatchInternal::AppendedByteSize( total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } } } @@ -758,11 +767,13 @@ Status DBImpl::WriteImplWALOnly( WriteStatusCheck(status); } if (status.ok()) { + size_t index = 0; for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -1121,7 +1132,7 @@ Status DBImpl::WriteRecoverableState() { // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB mutex_.Unlock(); status = recoverable_state_pre_release_callback_->Callback( - sub_batch_seq, !DISABLE_MEMTABLE, no_log_num); + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1); mutex_.Lock(); } } diff --git a/db/pre_release_callback.h b/db/pre_release_callback.h index f91ef1b27ac..e4167904ff8 100644 --- a/db/pre_release_callback.h +++ b/db/pre_release_callback.h @@ -27,8 +27,12 @@ class PreReleaseCallback { // is_mem_disabled is currently used for debugging purposes to assert that // the callback is done from the right write queue. // If non-zero, log_number indicates the WAL log to which we wrote. + // index >= 0 specifies the order of callback in the same write thread. + // total > index specifies the total number of callbacks in the same write + // thread. Together with index, could be used to reduce the redundant + // operations among the callbacks. 
virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, - uint64_t log_number) = 0; + uint64_t log_number, size_t index, size_t total) = 0; }; } // namespace rocksdb diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index b5e26a8a7f0..1ab97b04589 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -304,7 +304,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} Status Callback(SequenceNumber last_seq, bool /*not used*/, - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { db_impl_->SetLastPublishedSequence(last_seq); return Status::OK(); } diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index ed7444894c7..1c0e2f06384 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -231,7 +231,8 @@ Status WriteCommittedTxn::PrepareInternal() { (void)two_write_queues_; // to silence unused private field warning } virtual Status Callback(SequenceNumber, bool is_mem_disabled, - uint64_t log_number) override { + uint64_t log_number, size_t /*index*/, + size_t /*total*/) override { #ifdef NDEBUG (void)is_mem_disabled; #endif diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 7c588f4ef69..e62b8344169 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -7,9 +7,9 @@ #include "utilities/transactions/transaction_test.h" -#include #include #include +#include #include #include #include @@ -55,25 +55,17 @@ TEST(PreparedHeap, BasicsTest) { heap.push(34l); // Test that old min is still on top ASSERT_EQ(14l, heap.top()); - heap.push(13l); - // Test that the new min will be on top - ASSERT_EQ(13l, heap.top()); - // Test that it is persistent - ASSERT_EQ(13l, heap.top()); heap.push(44l); heap.push(54l); heap.push(64l); heap.push(74l); heap.push(84l); // Test that old min is still on top - ASSERT_EQ(13l, heap.top()); + ASSERT_EQ(14l, heap.top()); heap.erase(24l); // Test that old min is still on top - ASSERT_EQ(13l, heap.top()); + ASSERT_EQ(14l, heap.top()); heap.erase(14l); - // Test that old min is still on top - ASSERT_EQ(13l, heap.top()); - heap.erase(13l); // Test that the new comes to the top after multiple erase ASSERT_EQ(34l, heap.top()); heap.erase(34l); @@ -3001,13 +2993,16 @@ TEST_P(WritePreparedTransactionTest, AddPreparedBeforeMax) { ASSERT_OK(txn->Put(Slice("key0"), uncommitted_value)); port::Mutex txn_mutex_; - // t1) Insert prepared entry, t2) commit other entires to advance max - // evicted sec and finish checking the existing prepared entires, t1) + // t1) Insert prepared entry, t2) commit other entries to advance max + // evicted sec and finish checking the existing prepared entries, t1) // AddPrepared, t2) update max_evicted_seq_ rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"AddPrepared::begin:pause", "AddPreparedBeforeMax::read_thread:start"}, - {"AdvanceMaxEvictedSeq::update_max:pause", "AddPrepared::begin:resume"}, - {"AddPrepared::end", "AdvanceMaxEvictedSeq::update_max:resume"}, + {"AddPreparedCallback::AddPrepared::begin:pause", + "AddPreparedBeforeMax::read_thread:start"}, + {"AdvanceMaxEvictedSeq::update_max:pause", + "AddPreparedCallback::AddPrepared::begin:resume"}, + {"AddPreparedCallback::AddPrepared::end", + 
"AdvanceMaxEvictedSeq::update_max:resume"}, }); SyncPoint::GetInstance()->EnableProcessing(); @@ -3061,20 +3056,36 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { ReOpen(); std::atomic snap = {nullptr}; std::atomic exp_prepare = {0}; + std::atomic snapshot_taken = {false}; // Value is synchronized via snap PinnableSlice value; // Take a snapshot after publish and before RemovePrepared:Start + auto snap_callback = [&]() { + ASSERT_EQ(nullptr, snap.load()); + snap.store(db->GetSnapshot()); + ReadOptions roptions; + roptions.snapshot = snap.load(); + auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value); + ASSERT_OK(s); + snapshot_taken.store(true); + }; auto callback = [&](void* param) { SequenceNumber prep_seq = *((SequenceNumber*)param); if (prep_seq == exp_prepare.load()) { // only for write_thread - ASSERT_EQ(nullptr, snap.load()); - snap.store(db->GetSnapshot()); - ReadOptions roptions; - roptions.snapshot = snap.load(); - auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value); - ASSERT_OK(s); + // We need to spawn a thread to avoid deadlock since getting a + // snpashot might end up calling AdvanceSeqByOne which needs joining + // the write queue. + auto t = rocksdb::port::Thread(snap_callback); + t.detach(); + TEST_SYNC_POINT("callback:end"); } }; + // Wait for the first snapshot be taken in GetSnapshotInternal. Although + // it might be updated before GetSnapshotInternal finishes but this should + // cover most of the cases. + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"WritePreparedTxnDB::GetSnapshotInternal:first", "callback:end"}, + }); SyncPoint::GetInstance()->SetCallBack("RemovePrepared:Start", callback); SyncPoint::GetInstance()->EnableProcessing(); // Thread to cause frequent evictions @@ -3098,9 +3109,15 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { // Let an eviction to kick in std::this_thread::yield(); + snapshot_taken.store(false); exp_prepare.store(txn->GetId()); ASSERT_OK(txn->Commit()); delete txn; + // Wait for the snapshot taking that is triggered by + // RemovePrepared:Start callback + while (!snapshot_taken) { + std::this_thread::yield(); + } // Read with the snapshot taken before delayed_prepared_ cleanup ReadOptions roptions; diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index f55615063e5..f4c21d4769e 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -169,12 +169,15 @@ Status WritePreparedTxn::CommitInternal() { assert(!s.ok() || seq_used != kMaxSequenceNumber); const SequenceNumber commit_batch_seq = seq_used; if (LIKELY(do_one_write || !s.ok())) { - if (LIKELY(s.ok())) { - // Note RemovePrepared should be called after WriteImpl that publishsed + if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues && + s.ok())) { + // Note: RemovePrepared should be called after WriteImpl that publishsed // the seq. Otherwise SmallestUnCommittedSeq optimization breaks. 
wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); - } + } // else RemovePrepared is called from within PreReleaseCallback if (UNLIKELY(!do_one_write)) { + assert(!s.ok()); + // Cleanup the prepared entry we added with add_prepared_callback wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); } return s; @@ -199,10 +202,14 @@ Status WritePreparedTxn::CommitInternal() { NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_aux_batch); assert(!s.ok() || seq_used != kMaxSequenceNumber); - // Note RemovePrepared should be called after WriteImpl that publishsed the - // seq. Otherwise SmallestUnCommittedSeq optimization breaks. - wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); - wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues)) { + if (s.ok()) { + // Note: RemovePrepared should be called after WriteImpl that publishsed + // the seq. Otherwise SmallestUnCommittedSeq optimization breaks. + wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_); + } + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } // else RemovePrepared is called from within PreReleaseCallback return s; } @@ -348,6 +355,7 @@ Status WritePreparedTxn::RollbackInternal() { return s; } if (do_one_write) { + assert(!db_impl_->immutable_db_options().two_write_queues); wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); return s; } // else do the 2nd write for commit @@ -370,9 +378,13 @@ Status WritePreparedTxn::RollbackInternal() { ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log, "RollbackInternal (status=%s) commit: %" PRIu64, s.ToString().c_str(), GetId()); + // TODO(lth): For WriteUnPrepared that rollback is called frequently, + // RemovePrepared could be moved to the callback to reduce lock contention. if (s.ok()) { wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_); } + // Note: RemovePrepared for prepared batch is called from within + // PreReleaseCallback wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH); return s; diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 8e08d074134..96e1aa7a7ba 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -7,8 +7,8 @@ #include "utilities/transactions/write_prepared_txn_db.h" -#include #include +#include #include #include #include @@ -61,8 +61,8 @@ Status WritePreparedTxnDB::Initialize( explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) : db_(db) {} Status Callback(SequenceNumber commit_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { assert(!is_mem_disabled); db_->AddCommitted(commit_seq, commit_seq); return Status::OK(); @@ -211,9 +211,7 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH, &update_commit_map_with_prepare); assert(!s.ok() || seq_used != kMaxSequenceNumber); - // Note RemovePrepared should be called after WriteImpl that publishsed the - // seq. Otherwise SmallestUnCommittedSeq optimization breaks. 
- RemovePrepared(prepare_seq, batch_cnt); + // Note: RemovePrepared is called from within PreReleaseCallback return s; } @@ -389,8 +387,8 @@ void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) { new std::atomic[COMMIT_CACHE_SIZE] {}); } -void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max) { - prepared_mutex_.AssertHeld(); +void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max, + bool locked) { // When max_evicted_seq_ advances, move older entries from prepared_txns_ // to delayed_prepared_. This guarantees that if a seq is lower than max, // then it is not in prepared_txns_ and save an expensive, synchronized @@ -401,25 +399,42 @@ void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max) { "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64, prepared_txns_.empty(), prepared_txns_.empty() ? 0 : prepared_txns_.top()); - while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { - auto to_be_popped = prepared_txns_.top(); - delayed_prepared_.insert(to_be_popped); - ROCKS_LOG_WARN(info_log_, - "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 - " new_max=%" PRIu64, - static_cast(delayed_prepared_.size()), - to_be_popped, new_max); - prepared_txns_.pop(); - delayed_prepared_empty_.store(false, std::memory_order_release); + const SequenceNumber prepared_top = prepared_txns_.top(); + const bool empty = prepared_top == kMaxSequenceNumber; + // Preliminary check to avoid the synchronization cost + if (!empty && prepared_top <= new_max) { + if (locked) { + // Needed to avoid double locking in pop(). + prepared_txns_.push_pop_mutex()->Unlock(); + } + WriteLock wl(&prepared_mutex_); + // Need to fetch fresh values of ::top after mutex is acquired + while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) { + auto to_be_popped = prepared_txns_.top(); + delayed_prepared_.insert(to_be_popped); + ROCKS_LOG_WARN(info_log_, + "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64 + " new_max=%" PRIu64, + static_cast(delayed_prepared_.size()), + to_be_popped, new_max); + prepared_txns_.pop(); + delayed_prepared_empty_.store(false, std::memory_order_release); + } + if (locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } } } -void WritePreparedTxnDB::AddPrepared(uint64_t seq) { +void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) { ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64, seq, max_evicted_seq_.load()); TEST_SYNC_POINT("AddPrepared::begin:pause"); TEST_SYNC_POINT("AddPrepared::begin:resume"); - WriteLock wl(&prepared_mutex_); + if (!locked) { + prepared_txns_.push_pop_mutex()->Lock(); + } + prepared_txns_.push_pop_mutex()->AssertHeld(); prepared_txns_.push(seq); auto new_max = future_max_evicted_seq_.load(); if (UNLIKELY(seq <= new_max)) { @@ -429,7 +444,10 @@ void WritePreparedTxnDB::AddPrepared(uint64_t seq) { "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64 " <= %" PRIu64, seq, new_max); - CheckPreparedAgainstMax(new_max); + CheckPreparedAgainstMax(new_max, true /*locked*/); + } + if (!locked) { + prepared_txns_.push_pop_mutex()->Unlock(); } TEST_SYNC_POINT("AddPrepared::end"); } @@ -582,10 +600,7 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max, std::memory_order_relaxed)) { }; - { - WriteLock wl(&prepared_mutex_); - CheckPreparedAgainstMax(new_max); - } + CheckPreparedAgainstMax(new_max, false /*locked*/); // With each change to max_evicted_seq_ fetch the live snapshots behind it. 
// We use max as the version of snapshots to identify how fresh are the @@ -641,6 +656,7 @@ SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal( // than the smallest uncommitted seq when the snapshot was taken. auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq(); SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check); + TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first"); assert(snap_impl); SequenceNumber snap_seq = snap_impl->GetSequenceNumber(); // Note: Check against future_max_evicted_seq_ (in contrast with @@ -679,6 +695,7 @@ SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal( db_impl_->immutable_db_options().info_log, "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64, snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted); + TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end"); return snap_impl; } diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index 876279cba23..acf2b97a99d 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -324,10 +324,11 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // Add the transaction with prepare sequence seq to the prepared list. // Note: must be called serially with increasing seq on each call. - void AddPrepared(uint64_t seq); + // locked is true if prepared_mutex_ is already locked. + void AddPrepared(uint64_t seq, bool locked = false); // Check if any of the prepared txns are less than new max_evicted_seq_. Must // be called with prepared_mutex_ write locked. - void CheckPreparedAgainstMax(SequenceNumber new_max); + void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked); // Remove the transaction with prepare sequence seq from the prepared list void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1); // Add the transaction with prepare sequence prepare_seq and commit sequence @@ -461,6 +462,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { std::memory_order order = std::memory_order_relaxed); private: + friend class AddPreparedCallback; friend class PreparedHeap_BasicsTest_Test; friend class PreparedHeap_Concurrent_Test; friend class PreparedHeap_EmptyAtTheEnd_Test; @@ -506,10 +508,15 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // A heap with the amortized O(1) complexity for erase. It uses one extra heap // to keep track of erased entries that are not yet on top of the main heap. class PreparedHeap { + // The mutex is required for push and pop from PreparedHeap. ::erase will + // use external synchronization via prepared_mutex_. + port::Mutex push_pop_mutex_; + // TODO(myabandeh): replace it with deque std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> heap_; std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>> erased_heap_; + std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber}; // True when testing crash recovery bool TEST_CRASH_ = false; friend class WritePreparedTxnDB; @@ -521,10 +528,19 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { assert(erased_heap_.empty()); } } - bool empty() { return heap_.empty(); } - uint64_t top() { return heap_.top(); } - void push(uint64_t v) { heap_.push(v); } - void pop() { + port::Mutex* push_pop_mutex() { return &push_pop_mutex_; } + + inline bool empty() { return top() == kMaxSequenceNumber; } + // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
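+ // The current top is mirrored in the atomic heap_top_ (updated by push/pop + // while push_pop_mutex_ is held), so top() can be read without the mutex.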
+ inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); } + inline void push(uint64_t v) { + heap_.push(v); + heap_top_.store(heap_.top(), std::memory_order_release); + } + void pop(bool locked = false) { + if (!locked) { + push_pop_mutex()->Lock(); + } heap_.pop(); while (!heap_.empty() && !erased_heap_.empty() && // heap_.top() > erased_heap_.top() could happen if we have erased @@ -543,15 +559,23 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { while (heap_.empty() && !erased_heap_.empty()) { erased_heap_.pop(); } + heap_top_.store(!heap_.empty() ? heap_.top() : kMaxSequenceNumber, + std::memory_order_release); + if (!locked) { + push_pop_mutex()->Unlock(); + } } + // Concurrent calls need external synchronization. It is safe to call this + // concurrently with push and pop, though. void erase(uint64_t seq) { if (!heap_.empty()) { + auto top_seq = top(); - if (seq < heap_.top()) { + if (seq < top_seq) { // Already popped, ignore it. - } else if (heap_.top() == seq) { + } else if (top_seq == seq) { pop(); assert(heap_.empty() || heap_.top() != seq); - } else { // (heap_.top() > seq) + } else { // top() > seq // Down the heap, remember to pop it later erased_heap_.push(seq); } } @@ -596,27 +620,37 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // written in two steps, we also update prepared_txns_ at the first step // (via the same mechanism) so that their uncommitted data is reflected in // SmallestUnCommittedSeq. - ReadLock rl(&prepared_mutex_); - // Since we are holding the mutex, and GetLatestSequenceNumber is updated - // after prepared_txns_ are, the value of GetLatestSequenceNumber would - // reflect any uncommitted data that is not added to prepared_txns_ yet. - // Otherwise, if there is no concurrent txn, this value simply reflects that - // latest value in the memtable. - if (!delayed_prepared_.empty()) { - assert(!delayed_prepared_empty_.load()); - return *delayed_prepared_.begin(); + if (!delayed_prepared_empty_.load()) { + ReadLock rl(&prepared_mutex_); + if (!delayed_prepared_.empty()) { + return *delayed_prepared_.begin(); + } } - if (prepared_txns_.empty()) { - return db_impl_->GetLatestSequenceNumber() + 1; + // This must be called before calling ::top. This is because the concurrent + // thread would call ::RemovePrepared before updating + // GetLatestSequenceNumber(). Reading them in the opposite order here + // guarantees that the ::top we read would be lower than the ::top if we + // had otherwise updated/read them atomically. + auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1; + auto min_prepare = prepared_txns_.top(); + bool empty = min_prepare == kMaxSequenceNumber; + if (empty) { + // Since GetLatestSequenceNumber is updated + // after prepared_txns_ are, the value of GetLatestSequenceNumber would + // reflect any uncommitted data that is not added to prepared_txns_ yet. + // Otherwise, if there is no concurrent txn, this value simply reflects + // the latest value in the memtable.
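+ // (Illustrative example: with no prepared txns and + // GetLatestSequenceNumber() == 42, next_prepare == 43, so + // SmallestUnCommittedSeq() returns 43.)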
+ return next_prepare; } else { - return std::min(prepared_txns_.top(), - db_impl_->GetLatestSequenceNumber() + 1); + return std::min(min_prepare, next_prepare); } } + // Enhance the snapshot object by recording in it the smallest uncommitted seq inline void EnhanceSnapshot(SnapshotImpl* snapshot, SequenceNumber min_uncommitted) { assert(snapshot); + assert(min_uncommitted <= snapshot->number_ + 1); snapshot->min_uncommitted_ = min_uncommitted; } @@ -778,12 +812,28 @@ class AddPreparedCallback : public PreReleaseCallback { } virtual Status Callback(SequenceNumber prepare_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t log_number) override { + uint64_t log_number, size_t index, + size_t total) override { + assert(index < total); + // To reduce the cost of lock acquisition competing with the concurrent + // prepare requests, lock on the first callback and unlock on the last. + const bool do_lock = !two_write_queues_ || index == 0; + const bool do_unlock = !two_write_queues_ || index + 1 == total; // Always Prepare from the main queue assert(!two_write_queues_ || !is_mem_disabled); // implies the 1st queue + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause"); + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume"); + if (do_lock) { + db_->prepared_txns_.push_pop_mutex()->Lock(); + } + const bool kLocked = true; for (size_t i = 0; i < sub_batch_cnt_; i++) { - db_->AddPrepared(prepare_seq + i); + db_->AddPrepared(prepare_seq + i, kLocked); } + if (do_unlock) { + db_->prepared_txns_.push_pop_mutex()->Unlock(); + } + TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end"); if (first_prepare_batch_) { assert(log_number != 0); db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection( @@ -826,7 +876,8 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { virtual Status Callback(SequenceNumber commit_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { // Always commit from the 2nd queue assert(!db_impl_->immutable_db_options().two_write_queues || is_mem_disabled); @@ -863,6 +914,14 @@ class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { // publish sequence numbers will be in order, i.e., once a seq is // published all the seq prior to that are also publishable. db_impl_->SetLastPublishedSequence(last_commit_seq); + // Note RemovePrepared should be called after publishing the seq. + // Otherwise SmallestUnCommittedSeq optimization breaks. 
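+ // (Until this commit is published, the entry in prepared_txns_ is what + // keeps SmallestUnCommittedSeq() at or below this txn's prepare seq; + // removing it before the publish would let a concurrent reader compute a + // value larger than the seq of a commit that is not yet visible.)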
+ if (prep_seq_ != kMaxSequenceNumber) { + db_->RemovePrepared(prep_seq_, prep_batch_cnt_); + } // else there was no prepare phase + if (includes_aux_batch_) { + db_->RemovePrepared(aux_seq_, aux_batch_cnt_); + } } // else SequenceNumber that is updated as part of the write already does the // publishing @@ -907,8 +966,8 @@ class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback { assert(prep_batch_cnt_ > 0); } - Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, - uint64_t) override { + Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t, + size_t /*index*/, size_t /*total*/) override { // Always commit from the 2nd queue assert(is_mem_disabled); // implies the 2nd queue assert(db_impl_->immutable_db_options().two_write_queues); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 73e9a8837a0..a1fe213ddd3 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -319,8 +319,8 @@ Status WriteUnpreparedTxn::CommitInternal() { explicit PublishSeqPreReleaseCallback(DBImpl* db_impl) : db_impl_(db_impl) {} Status Callback(SequenceNumber seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { assert(is_mem_disabled); assert(db_impl_->immutable_db_options().two_write_queues); db_impl_->SetLastPublishedSequence(seq); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index ea655f88e3c..0c94183947f 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -185,8 +185,8 @@ Status WriteUnpreparedTxnDB::Initialize( explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db) : db_(db) {} Status Callback(SequenceNumber commit_seq, - bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + bool is_mem_disabled __attribute__((__unused__)), uint64_t, + size_t /*index*/, size_t /*total*/) override { assert(!is_mem_disabled); db_->AddCommitted(commit_seq, commit_seq); return Status::OK(); diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h index fab8ce8263d..6405ba68381 100644 --- a/utilities/transactions/write_unprepared_txn_db.h +++ b/utilities/transactions/write_unprepared_txn_db.h @@ -57,7 +57,8 @@ class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { virtual Status Callback(SequenceNumber commit_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1) ? 
commit_seq : commit_seq + data_batch_cnt_ - 1; @@ -121,7 +122,8 @@ class WriteUnpreparedRollbackPreReleaseCallback : public PreReleaseCallback { virtual Status Callback(SequenceNumber commit_seq, bool is_mem_disabled __attribute__((__unused__)), - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { assert(is_mem_disabled); // implies the 2nd queue const uint64_t last_commit_seq = commit_seq; db_->AddCommitted(rollback_seq_, last_commit_seq); From 6ce5580882bda5791bec61b033e03a452a7a8483 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 10 Jun 2019 12:53:56 -0700 Subject: [PATCH 121/572] Improve memtable earliest seqno assignment for secondary instance (#5413) Summary: In a regular RocksDB instance, `MemTable::earliest_seqno_` is "db sequence number at the time of creation". However, we cannot use the db sequence number to set the value of `MemTable::earliest_seqno_` for a secondary instance, i.e. `DBImplSecondary`, due to the logic of MANIFEST and WAL replay. When replaying the log files of the primary, the secondary instance first replays MANIFEST and updates the db sequence number if necessary. Next, the secondary replays WAL files, creates new memtables if necessary and inserts key-value pairs into memtables. The following can occur when the db has two or more column families. Assume the db has column families "default" and "cf1". At a certain point in time, both "default" and "cf1" have data in memtables. 1. Primary triggers a flush and flushes "cf1". "default" is **not** flushed. 2. Secondary replays the MANIFEST and updates its db sequence number to the latest value learned from the MANIFEST. 3. Secondary starts to replay the WAL that contains the writes to "default". It is possible that the write batches' sequence numbers are smaller than the db sequence number. In this case, these write batches will be skipped, and these updates will not be visible to readers until "default" is later flushed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5413 Differential Revision: D15637407 Pulled By: riversand963 fbshipit-source-id: 3de3fe35cfc6f1b9f844f3f926f0df29717b6580 --- HISTORY.md | 1 + db/db_impl/db_impl_secondary.cc | 36 ++++++++++++++++++++++++--------- db/db_impl/db_secondary_test.cc | 7 +++++++ 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c88b436e40d..ad6c370b5a0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -27,6 +27,7 @@ ### Bug Fixes * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. +* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family.
## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 827d99929a9..eb8c4c98738 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -199,16 +199,8 @@ Status DBImplSecondary::RecoverLogFiles( record.size(), Status::Corruption("log record too small")); continue; } - SequenceNumber seq = versions_->LastSequence(); WriteBatchInternal::SetContents(&batch, record); SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); - // If the write batch's sequence number is smaller than the last sequence - // number of the db, then we should skip this write batch because its - // data must reside in an SST that has already been added in the prior - // MANIFEST replay. - if (seq_of_batch < seq) { - continue; - } std::vector<uint32_t> column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); if (status.ok()) { @@ -221,6 +213,17 @@ Status DBImplSecondary::RecoverLogFiles( if (cfds_changed->count(cfd) == 0) { cfds_changed->insert(cfd); } + const std::vector<FileMetaData*>& l0_files = + cfd->current()->storage_info()->LevelFiles(0); + SequenceNumber seq = + l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno; + // If the write batch's sequence number is no larger than the largest + // sequence number persisted for this column family, then its data must + // reside in an SST that has already been added in the prior MANIFEST + // replay. + if (seq_of_batch <= seq) { + continue; + } auto curr_log_num = port::kMaxUint64; if (cfd_to_current_log_.count(cfd) > 0) { curr_log_num = cfd_to_current_log_[cfd]; } @@ -233,7 +236,7 @@ Status DBImplSecondary::RecoverLogFiles( const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); MemTable* new_mem = - cfd->ConstructNewMemtable(mutable_cf_options, seq); + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); cfd->mem()->SetNextLogNumber(log_number); cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); new_mem->Ref(); @@ -452,6 +455,21 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { InstrumentedMutexLock lock_guard(&mutex_); s = static_cast<ReactiveVersionSet*>(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, + static_cast<uint64_t>(versions_->LastSequence())); + for (ColumnFamilyData* cfd : cfds_changed) { + if (cfd->IsDropped()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n", + cfd->GetName().c_str()); + continue; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Level summary: %s\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + // list wal_dir to discover new WALs and apply new changes to the secondary // instance if (s.ok()) { diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index 5b375422f02..c9aaa361191 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -576,6 +576,11 @@ TEST_F(DBSecondaryTest, SwitchWAL) { TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); const std::string kCFName1 = "pikachu"; Options options; options.env =
env_; @@ -629,8 +634,10 @@ TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); ASSERT_OK( Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + TEST_SYNC_POINT("DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); } } From 63ace8ef0e644ab3384b0a19f0235cd6596f70c1 Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 10 Jun 2019 13:28:18 -0700 Subject: [PATCH 122/572] Reuse data block iterator in BlockBasedTableReader::MultiGet() (#5314) Summary: Instead of creating a new DataBlockIterator for every key in a MultiGet batch, reuse it if the next key is in the same block. This results in a small 1-2% cpu improvement. TEST_TMPDIR=/dev/shm/multiget numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Without the change - multireadrandom : 3.066 micros/op 326122 ops/sec; (29375968 of 29375968 found) With the change - multireadrandom : 3.003 micros/op 332945 ops/sec; (29983968 of 29983968 found) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5314 Differential Revision: D15742108 Pulled By: anand1976 fbshipit-source-id: 220fb0b8eea9a0d602ddeb371528f7af7936d771 --- table/block_based/block.h | 13 ++++++ table/block_based/block_based_table_reader.cc | 43 ++++++++++++++++--- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/table/block_based/block.h b/table/block_based/block.h index 3c54389b08a..2bb577d33bd 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -236,6 +236,7 @@ class BlockIter : public InternalIteratorBase { restart_index_ = num_restarts_; global_seqno_ = global_seqno; block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; } // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do @@ -285,6 +286,10 @@ class BlockIter : public InternalIteratorBase { return static_cast(value_.data() - data_); } + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + protected: // Note: The type could be changed to InternalKeyComparator but we see a weird // performance drop by that. @@ -307,6 +312,14 @@ class BlockIter : public InternalIteratorBase { bool block_contents_pinned_; SequenceNumber global_seqno_; + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + public: // Return the offset in data_ just past the end of the current entry. 
inline uint32_t NextEntryOffset() const { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 68213f04149..a8e4e1d40db 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -129,6 +129,14 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } +// Release the cached entry and decrement its ref count. +// Do not force erase +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, false /* force_erase */); +} + // For hash based index, return true if prefix_extractor and // prefix_extractor_block mismatch, false otherwise. This flag will be used // as total_order_seek via NewIndexIterator @@ -2073,6 +2081,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( cache_handle); } } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); } block.TransferTo(iter); @@ -2933,6 +2943,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, iiter_unique_ptr.reset(iiter); } + DataBlockIter biter; + uint64_t offset = std::numeric_limits<uint64_t>::max(); for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); ++miter) { Status s; @@ -2941,10 +2953,15 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - DataBlockIter biter; - NewDataBlockIterator<DataBlockIter>( - read_options, iiter->value(), &biter, BlockType::kData, - true /* key_includes_seq */, get_context); + bool reusing_block = true; + if (iiter->value().offset() != offset) { + offset = iiter->value().offset(); + biter.Invalidate(Status::OK()); + NewDataBlockIterator<DataBlockIter>( + read_options, iiter->value(), &biter, BlockType::kData, false, + true /* key_includes_seq */, get_context); + reusing_block = false; + } if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { @@ -2971,13 +2988,27 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, // Call the *saver function on each entry/block until it returns false for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + if (!ParseInternalKey(biter.key(), &parsed_key)) { s = Status::Corruption(Slice()); } + if (biter.IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter.cache_handle() != nullptr); + block_cache->Ref(biter.cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter.cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = &biter; + } + } if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { + parsed_key, biter.value(), &matched, value_pinner)) { done = true; break; } From 5efa0d6b0df1f3aea2ea8720c48c2b918b47ead1 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 10 Jun 2019 15:30:05 -0700 Subject: [PATCH 123/572] Create a BlockCacheLookupContext to enable fine-grained block cache tracing. (#5421) Summary: BlockCacheLookupContext only contains the caller for now. We will trace block accesses at five places: 1. BlockBasedTable::GetFilter. 2. BlockBasedTable::GetUncompressedDict. 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, and range deletion block.) 4.
BlockBasedTable::Get. (To trace the referenced key and whether the referenced key exists in a fetched data block.) 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the referenced key exists in a fetched data block.) We create the context at: 1. BlockBasedTable::Get. (kUserGet) 2. BlockBasedTable::MultiGet. (kUserMGet) 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or external SST ingestion calls this function.) 4. BlockBasedTable::Open. (kPrefetch) 5. Index/Filter::CacheDependencies. (kPrefetch) 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or kUserApproximateSize). I loaded 1 million key-value pairs into the database and ran the readrandom benchmark with a single thread. I gave the block cache 10 GB to make sure all reads hit the block cache after warmup. The throughput is comparable. Throughput of this PR: 231334 ops/s. Throughput of the master branch: 238428 ops/s. Experiment setup: RocksDB: version 6.2 Date: Mon Jun 10 10:42:51 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 1000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 114.4 MB (estimated) FileSize: 114.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 Load command: ./db_bench --benchmarks="fillseq" --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 Run command: ./db_bench --benchmarks="readrandom,stats" --use_existing_db --threads=1 --duration=120 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --statistics --cache_index_and_filter_blocks --cache_size=10737418240 --disable_auto_compactions=1 --disable_wal=1 --compression_type=none --min_level_to_compress=-1 --compression_ratio=1 --num=1000000 --duration=120 TODOs: 1. Create a caller for external SST file ingestion and differentiate the callers for iterator. 2. Integrate tracer to trace block cache accesses. 
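To make the new plumbing concrete, here is a minimal sketch of the context type and of how call sites thread it through the reader. The real declaration lives in trace_replay/block_cache_tracer.h; the exact enum shape and member layout below are assumptions inferred from the call sites in this diff, not the verbatim header.

// Which code path initiated the block cache lookup (sketch only).
enum class BlockCacheLookupCaller : char {
  kUserGet,
  kUserMGet,
  kUserIterator,
  kPrefetch,
  kCompaction,
  kUserApproximateSize,
};

// For now the context records only the caller; later fields (e.g. the
// referenced key) can be added without changing any of the plumbing.
struct BlockCacheLookupContext {
  explicit BlockCacheLookupContext(BlockCacheLookupCaller c) : caller(c) {}
  BlockCacheLookupCaller caller;
};

Each top-level operation constructs one context and passes its address down the call chain, e.g. a point read would do BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet}; and then hand &lookup_context to NewDataBlockIterator and friends, as the hunks below show.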
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5421 Differential Revision: D15704258 Pulled By: HaoyuHuang fbshipit-source-id: 4aa8a55f8cb1576ffb367bfa3186a91d8f06d93a --- db/compaction/compaction_job.cc | 3 +- db/db_impl/db_impl.cc | 4 +- db/version_set.cc | 21 +- db/version_set.h | 7 +- table/block_based/block_based_filter_block.cc | 6 +- table/block_based/block_based_filter_block.h | 23 +- .../block_based_filter_block_test.cc | 200 +++++++++--- table/block_based/block_based_table_reader.cc | 300 +++++++++++------- table/block_based/block_based_table_reader.h | 89 +++--- table/block_based/filter_block.h | 34 +- table/block_based/full_filter_block.cc | 23 +- table/block_based/full_filter_block.h | 59 ++-- table/block_based/full_filter_block_test.cc | 64 +++- table/block_based/partitioned_filter_block.cc | 37 ++- table/block_based/partitioned_filter_block.h | 30 +- .../partitioned_filter_block_test.cc | 18 +- table/cuckoo/cuckoo_table_reader.h | 5 +- table/mock_table.h | 7 +- table/plain/plain_table_reader.cc | 3 +- table/plain/plain_table_reader.h | 3 +- table/table_reader.h | 3 +- trace_replay/block_cache_tracer.h | 30 +- 22 files changed, 634 insertions(+), 335 deletions(-) diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index ca8575a0dc9..65efedad5b4 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() { // to the index block and may incur I/O cost in the process. Unlock db // mutex to reduce contention db_mutex_->Unlock(); - uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1); + uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, + /*for_compaction*/ true); db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index bb6ec7db4c5..b1a828f9f0e 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2717,7 +2717,9 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) { - sizes[i] += versions_->ApproximateSize(v, k1.Encode(), k2.Encode()); + sizes[i] += versions_->ApproximateSize( + v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, + /*for_compaction=*/false); } if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; diff --git a/db/version_set.cc b/db/version_set.cc index 96bf22e57b4..8895879bfbf 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4827,7 +4827,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // maintain state of where they first appear in the files. 
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, const Slice& end, int start_level, - int end_level) { + int end_level, bool for_compaction) { // pre-condition assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); @@ -4848,7 +4848,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (!level) { // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end); + size += ApproximateSizeLevel0(v, files_brief, start, end, for_compaction); continue; } @@ -4865,7 +4865,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, // inferred from the sorted order for (uint64_t i = idx_start; i < files_brief.num_files; i++) { uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end); + val = ApproximateSize(v, files_brief.files[i], end, for_compaction); if (!val) { // the files after this will not have the range break; @@ -4876,7 +4876,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (i == idx_start) { // subtract the bytes needed to be scanned to get to the starting // key - val = ApproximateSize(v, files_brief.files[i], start); + val = ApproximateSize(v, files_brief.files[i], start, for_compaction); assert(size >= val); size -= val; } @@ -4889,13 +4889,16 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, uint64_t VersionSet::ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, const Slice& key_start, - const Slice& key_end) { + const Slice& key_end, + bool for_compaction) { // level 0 files are not in sorted order, we need to iterate through // the list to compute the total bytes that require scanning uint64_t size = 0; for (size_t i = 0; i < files_brief.num_files; i++) { - const uint64_t start = ApproximateSize(v, files_brief.files[i], key_start); - const uint64_t end = ApproximateSize(v, files_brief.files[i], key_end); + const uint64_t start = + ApproximateSize(v, files_brief.files[i], key_start, for_compaction); + const uint64_t end = + ApproximateSize(v, files_brief.files[i], key_end, for_compaction); assert(end >= start); size += end - start; } @@ -4903,7 +4906,7 @@ uint64_t VersionSet::ApproximateSizeLevel0(Version* v, } uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key) { + const Slice& key, bool for_compaction) { // pre-condition assert(v); @@ -4923,7 +4926,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, *f.file_metadata, nullptr /* range_del_agg */, v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key); + result = table_reader_ptr->ApproximateOffsetOf(key, for_compaction); } delete iter; } diff --git a/db/version_set.h b/db/version_set.h index dc9e759655e..8a43b982366 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -982,7 +982,7 @@ class VersionSet { // in levels [start_level, end_level). 
If end_level == 0 it will search // through all non-empty levels uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, - int start_level = 0, int end_level = -1); + int start_level, int end_level, bool for_compaction); // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } @@ -1032,10 +1032,11 @@ class VersionSet { // ApproximateSize helper uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, - const Slice& start, const Slice& end); + const Slice& start, const Slice& end, + bool for_compaction); uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key); + const Slice& key, bool for_compaction); // Save current contents to *log Status WriteSnapshot(log::Writer* log); diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index fb366b5d316..e5a32e4635f 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -187,7 +187,8 @@ BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( bool BlockBasedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /* prefix_extractor */, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { assert(block_offset != kNotValid); if (!whole_key_filtering_) { return true; @@ -198,7 +199,8 @@ bool BlockBasedFilterBlockReader::KeyMayMatch( bool BlockBasedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { assert(block_offset != kNotValid); return MayMatch(prefix, block_offset); } diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index 74a2285e1e9..cd86ff5c8a5 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -82,17 +82,18 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { const BlockBasedTableOptions& table_opt, bool whole_key_filtering, BlockContents&& contents, Statistics* statistics); - virtual bool IsBlockBased() override { return true; } - - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; + bool IsBlockBased() override { return true; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + size_t ApproximateMemoryUsage() const override; // convert this object to a human readable form std::string ToString() const override; diff --git 
a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index e0ca57f1c51..220888dd2fb 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -57,8 +57,12 @@ TEST_F(FilterBlockTest, EmptyBuilder) { ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100000)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(FilterBlockTest, SingleChunk) { @@ -76,13 +80,27 @@ TEST_F(FilterBlockTest, SingleChunk) { BlockContents block(builder.Finish()); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(FilterBlockTest, MultiChunk) { @@ -110,28 +128,60 @@ TEST_F(FilterBlockTest, MultiChunk) { std::move(block), nullptr); // Check first filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, uint64_t{0})); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", 
/*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check second filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 3100)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check third filter (empty) - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check last filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 9000)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } // Test for block based filter block @@ -154,8 +204,12 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100000)); + ASSERT_TRUE(reader->KeyMayMatch( + 
"foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); delete builder; delete reader; @@ -175,13 +229,27 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { BlockContents block(builder->Finish()); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("other", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); delete builder; delete reader; @@ -213,28 +281,60 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { nullptr, table_options_, true, std::move(block), nullptr); // Check first filter - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, uint64_t{0})); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check second filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 
3100)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check third filter (empty) - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); // Check last filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 9000)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); delete builder; delete reader; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a8e4e1d40db..d1beafed68b 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -178,6 +178,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block); const BlockBasedTable* table() const { return table_; } @@ -211,6 +212,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { Status GetOrReadIndexBlock(const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const; size_t ApproximateIndexBlockMemoryUsage() const { @@ -228,6 +230,7 @@ class BlockBasedTable::IndexReaderCommon : public 
BlockBasedTable::IndexReader { Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block) { PERF_TIMER_GUARD(read_index_block_nanos); @@ -241,13 +244,14 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->footer.index_handle(), UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, - get_context); + get_context, lookup_context); return s; } Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const { assert(index_block != nullptr); @@ -256,8 +260,8 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( return Status::OK(); } - return ReadIndexBlock(table_, nullptr /* prefetch_buffer */, read_options, - get_context, index_block); + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + get_context, lookup_context, index_block); } // Index that allows binary search lookup in a two-level index structure. @@ -269,7 +273,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader) { + bool prefetch, bool pin, IndexReader** index_reader, + BlockCacheLookupContext* lookup_context) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -277,8 +282,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; if (prefetch || !use_cache) { - const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), - nullptr /* get_context */, &index_block); + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + /*get_context=*/nullptr, lookup_context, &index_block); if (!s.ok()) { return s; } @@ -296,10 +302,11 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // return a two-level iterator: first level is on the partition index InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context) override { + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(read_options, get_context, &index_block); + const Status s = GetOrReadIndexBlock(read_options, get_context, + lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -352,6 +359,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; auto rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; @@ -359,7 +367,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */, - &index_block); + &lookup_context, &index_block); if (!s.ok()) { ROCKS_LOG_WARN(rep->ioptions.info_log, "Error retrieving top-level index block while trying to " @@ 
-408,7 +416,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, nullptr /* get_context */); + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -451,7 +459,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader) { + bool prefetch, bool pin, IndexReader** index_reader, + BlockCacheLookupContext* lookup_context) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -459,8 +468,9 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; if (prefetch || !use_cache) { - const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), - nullptr /* get_context */, &index_block); + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + /*get_context=*/nullptr, lookup_context, &index_block); if (!s.ok()) { return s; } @@ -477,10 +487,11 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context) override { + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(read_options, get_context, &index_block); + const Status s = GetOrReadIndexBlock(read_options, get_context, + lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -526,7 +537,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader) { + bool prefetch, bool pin, IndexReader** index_reader, + BlockCacheLookupContext* lookup_context) { assert(table != nullptr); assert(index_reader != nullptr); assert(!pin || prefetch); @@ -536,8 +548,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { CachableEntry index_block; if (prefetch || !use_cache) { - const Status s = ReadIndexBlock(table, prefetch_buffer, ReadOptions(), - nullptr /* get_context */, &index_block); + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), + /*get_context=*/nullptr, lookup_context, &index_block); if (!s.ok()) { return s; } @@ -616,10 +629,11 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* iter, GetContext* get_context) override { + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(read_options, get_context, &index_block); + const Status s = GetOrReadIndexBlock(read_options, get_context, + lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -1055,6 +1069,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Better 
not mutate rep_ after the creation. eg. internal_prefix_transform // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, internal_comparator, skip_filters, level, immortal_table); @@ -1095,13 +1110,13 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, return s; } s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), meta_iter.get(), - internal_comparator); + internal_comparator, &lookup_context); if (!s.ok()) { return s; } s = new_table->PrefetchIndexAndFilterBlocks( prefetch_buffer.get(), meta_iter.get(), new_table.get(), prefetch_all, - table_options, level); + table_options, level, &lookup_context); if (s.ok()) { // Update tail prefetch stats @@ -1304,7 +1319,8 @@ Status BlockBasedTable::ReadPropertiesBlock( Status BlockBasedTable::ReadRangeDelBlock( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator) { + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { Status s; bool found_range_del_block; BlockHandle range_del_handle; @@ -1317,10 +1333,10 @@ Status BlockBasedTable::ReadRangeDelBlock( } else if (found_range_del_block && !range_del_handle.IsNull()) { ReadOptions read_options; std::unique_ptr iter(NewDataBlockIterator( - read_options, range_del_handle, nullptr /* input_iter */, - BlockType::kRangeDeletion, true /* key_includes_seq */, - true /* index_key_is_full */, nullptr /* get_context */, Status(), - prefetch_buffer)); + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*key_includes_seq=*/true, /*index_key_is_full=*/true, + /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); assert(iter != nullptr); s = iter->status(); if (!s.ok()) { @@ -1370,7 +1386,8 @@ Status BlockBasedTable::ReadCompressionDictBlock( Status BlockBasedTable::PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, - const BlockBasedTableOptions& table_options, const int level) { + const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context) { Status s; // Find filter handle and filter type @@ -1440,7 +1457,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( IndexReader* index_reader = nullptr; if (s.ok()) { s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, - prefetch_index, pin_index, &index_reader); + prefetch_index, pin_index, &index_reader, + lookup_context); if (s.ok()) { assert(index_reader != nullptr); rep_->index_reader.reset(index_reader); @@ -1467,7 +1485,9 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = - new_table->GetFilter(rep_->table_prefix_extractor.get()); + new_table->GetFilter(rep_->table_prefix_extractor.get(), + /*prefetch_buffer=*/nullptr, /*no_io=*/false, + /*get_context=*/nullptr, lookup_context); if (filter_entry.GetValue() != nullptr && prefetch_all) { filter_entry.GetValue()->CacheDependencies( pin_all, rep_->table_prefix_extractor.get()); @@ -1653,8 +1673,7 @@ Status BlockBasedTable::GetDataBlockFromCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = 
block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, - &cache_handle); + &DeleteCachedEntry, &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1758,8 +1777,7 @@ Status BlockBasedTable::PutDataBlockToCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, - &cache_handle, priority); + &DeleteCachedEntry, &cache_handle, priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1849,25 +1867,28 @@ FilterBlockReader* BlockBasedTable::ReadFilter( CachableEntry BlockBasedTable::GetFilter( const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) const { + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { const BlockHandle& filter_blk_handle = rep_->filter_handle; const bool is_a_filter_partition = true; return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, prefix_extractor); + no_io, get_context, lookup_context, prefix_extractor); } CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, + BlockCacheLookupContext* /*lookup_context*/, const SliceTransform* prefix_extractor) const { + // TODO(haoyu): Trace filter block access here. // If cache_index_and_filter_blocks is false, filter should be pre-populated. // We will return rep_->filter anyway. rep_->filter can be nullptr if filter // read fails at Open() time. We don't want to reload again since it will // most probably fail again. if (!is_a_filter_partition && !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; + return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr, + /*own_value=*/false}; } Cache* block_cache = rep_->table_options.block_cache.get(); @@ -1877,8 +1898,8 @@ CachableEntry BlockBasedTable::GetFilter( } if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { - return {rep_->filter_entry.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; + return {rep_->filter_entry.GetValue(), /*cache=*/nullptr, + /*cache_handle=*/nullptr, /*own_value=*/false}; } PERF_TIMER_GUARD(read_filter_block_nanos); @@ -1920,12 +1941,13 @@ CachableEntry BlockBasedTable::GetFilter( } return {filter, cache_handle ? block_cache : nullptr, cache_handle, - false /* own_value */}; + /*own_value=*/false}; } CachableEntry BlockBasedTable::GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context) const { + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* /*lookup_context*/) const { + // TODO(haoyu): Trace the access on the uncompression dictionary here. if (!rep_->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. 
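The brace initializers used in GetFilter above, {value, cache, cache_handle, own_value}, construct CachableEntry objects, and they encode three distinct ownership modes. A minimal stand-in that makes those modes explicit (SimpleCachableEntry is an illustrative name, not RocksDB API; the real template also releases its Cache::Handle on destruction):

// Simplified, self-contained stand-in for CachableEntry<T>; sketch only.
template <class T>
class SimpleCachableEntry {
 public:
  SimpleCachableEntry(T* value, void* cache, void* cache_handle,
                      bool own_value)
      : value_(value), cache_(cache), cache_handle_(cache_handle),
        own_value_(own_value) {}

  ~SimpleCachableEntry() {
    // Mode 1: cache_handle_ != nullptr -- the block cache owns the value;
    //         the real class releases the handle here (omitted).
    // Mode 2: own_value_ == true -- the entry owns the value outright.
    // Mode 3: neither -- a plain non-owning reference, which is what
    //         {rep_->filter.get(), /*cache=*/nullptr,
    //          /*cache_handle=*/nullptr, /*own_value=*/false} hands back
    //         when the table object itself owns the filter.
    if (own_value_) {
      delete value_;
    }
  }

  T* GetValue() const { return value_; }
  bool IsCached() const { return cache_handle_ != nullptr; }

 private:
  T* value_;
  void* cache_;
  void* cache_handle_;
  bool own_value_;
};

Under this reading, the IsCached() checks in the surrounding code simply ask whether a cache handle will eventually have to be released.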
@@ -1987,14 +2009,16 @@ CachableEntry BlockBasedTable::GetUncompressionDict( // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, GetContext* get_context) const { + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { assert(rep_ != nullptr); assert(rep_->index_reader != nullptr); // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, - input_iter, get_context); + input_iter, get_context, + lookup_context); } // Convert an index iterator value (i.e., an encoded BlockHandle) @@ -2005,7 +2029,7 @@ template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, bool index_key_is_full, - GetContext* get_context, Status s, + GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -2017,7 +2041,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool no_io = (ro.read_tier == kBlockCacheTier); auto uncompression_dict_storage = - GetUncompressionDict(prefetch_buffer, no_io, get_context); + GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context); const UncompressionDict& uncompression_dict = uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() @@ -2025,7 +2049,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, - block_type, get_context); + block_type, get_context, lookup_context); if (!s.ok()) { assert(block.IsEmpty()); @@ -2093,7 +2117,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context) const { + GetContext* get_context, + BlockCacheLookupContext* /*lookup_context*/) const { + // TODO(haoyu): Trace data/index/range deletion block access here. 
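  // Illustrative sketch only, not part of the patch: once the TODO above is
  // resolved, the state in scope here is what a trace record would capture.
  // The record type and sink below are hypothetical:
  //
  //   BlockCacheTraceRecord record;            // hypothetical type
  //   record.caller = lookup_context->caller;  // kUserGet, kCompaction, ...
  //   record.block_type = block_type;          // data / index / range del
  //   record.is_cache_hit = block_entry->GetValue() != nullptr;
  //   tracer_->WriteBlockAccess(record);       // hypothetical sink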
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();
@@ -2169,7 +2195,7 @@ Status BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
     CachableEntry<Block>* block_entry, BlockType block_type,
-    GetContext* get_context) const {
+    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
   assert(block_entry);
   assert(block_entry->IsEmpty());
@@ -2180,7 +2206,7 @@ Status BlockBasedTable::RetrieveBlock(
       block_type != BlockType::kIndex)) {
     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
                                      uncompression_dict, block_entry,
-                                     block_type, get_context);
+                                     block_type, get_context, lookup_context);
 
     if (!s.ok()) {
       return s;
@@ -2271,7 +2297,8 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
 bool BlockBasedTable::PrefixMayMatch(
     const Slice& internal_key, const ReadOptions& read_options,
     const SliceTransform* options_prefix_extractor,
-    const bool need_upper_bound_check) const {
+    const bool need_upper_bound_check,
+    BlockCacheLookupContext* lookup_context) const {
   if (!rep_->filter_policy) {
     return true;
   }
@@ -2295,7 +2322,9 @@ bool BlockBasedTable::PrefixMayMatch(
   Status s;
 
   // First, try check with full filter
-  auto filter_entry = GetFilter(prefix_extractor);
+  auto filter_entry =
+      GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false,
+                /*get_context=*/nullptr, lookup_context);
   FilterBlockReader* filter = filter_entry.GetValue();
   bool filter_checked = true;
   if (filter != nullptr) {
@@ -2304,7 +2333,7 @@ bool BlockBasedTable::PrefixMayMatch(
       may_match = filter->RangeMayExist(
           read_options.iterate_upper_bound, user_key, prefix_extractor,
           rep_->internal_comparator.user_comparator(), const_ikey_ptr,
-          &filter_checked, need_upper_bound_check);
+          &filter_checked, need_upper_bound_check, lookup_context);
     } else {
       // if prefix_extractor changed for block based filter, skip filter
       if (need_upper_bound_check) {
@@ -2323,9 +2352,10 @@ bool BlockBasedTable::PrefixMayMatch(
     // Then, try find it within each block
     // we already know prefix_extractor and prefix_extractor_name must match
     // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
-    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
-        NewIndexIterator(no_io_read_options,
-                         /* need_upper_bound_check */ false));
+    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
+        no_io_read_options,
+        /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+        /*get_context=*/nullptr, lookup_context));
     iiter->Seek(internal_prefix);
 
     if (!iiter->Valid()) {
@@ -2357,8 +2387,9 @@ bool BlockBasedTable::PrefixMayMatch(
         // possibly contain the key. Thus, the corresponding data block
         // is the only one that could potentially contain the prefix.
BlockHandle handle = iiter->value(); - may_match = - filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); + may_match = filter->PrefixMayMatch( + prefix, prefix_extractor, handle.offset(), /*no_io=*/false, + /*const_key_ptr=*/nullptr, lookup_context); } } } @@ -2588,7 +2619,7 @@ void BlockBasedTableIterator::InitDataBlock() { table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, - /* get_context */ nullptr, s, prefetch_buffer_.get()); + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; if (read_options_.iterate_upper_bound != nullptr) { data_block_within_upper_bound_ = @@ -2682,6 +2713,9 @@ void BlockBasedTableIterator::CheckOutOfBound() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, bool for_compaction) { + BlockCacheLookupContext lookup_context{ + for_compaction ? BlockCacheLookupCaller::kCompaction + : BlockCacheLookupCaller::kUserIterator}; bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); if (arena == nullptr) { @@ -2690,7 +2724,8 @@ InternalIterator* BlockBasedTable::NewIterator( NewIndexIterator( read_options, need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch), + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, @@ -2700,7 +2735,9 @@ InternalIterator* BlockBasedTable::NewIterator( arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, - NewIndexIterator(read_options, need_upper_bound_check), + NewIndexIterator(read_options, need_upper_bound_check, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, @@ -2724,7 +2761,8 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( bool BlockBasedTable::FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor) const { + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return true; } @@ -2735,15 +2773,16 @@ bool BlockBasedTable::FullFilterKeyMayMatch( size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); - may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, - kNotValid, no_io, const_ikey_ptr); + may_match = + filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), prefix_extractor, kNotValid, false, - const_ikey_ptr)) { + const_ikey_ptr, lookup_context)) { may_match = false; } if (may_match) { @@ 
-2756,12 +2795,14 @@ bool BlockBasedTable::FullFilterKeyMayMatch( void BlockBasedTable::FullFilterKeysMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor) const { + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return; } if (filter->whole_key_filtering()) { - filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io); + filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, + lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0) { @@ -2772,7 +2813,8 @@ void BlockBasedTable::FullFilterKeysMayMatch( range->SkipKey(iter); } } - filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false); + filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, + lookup_context); } } @@ -2786,18 +2828,19 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet}; { if (!skip_filters) { - filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier, get_context); + filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, + read_options.read_tier == kBlockCacheTier, + get_context, &lookup_context); } filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor); + prefix_extractor, &lookup_context); } if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); @@ -2811,8 +2854,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, need_upper_bound_check = PrefixExtractorChanged( rep_->table_properties.get(), prefix_extractor); } - auto iiter = NewIndexIterator(read_options, need_upper_bound_check, - &iiter_on_stack, get_context); + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + get_context, &lookup_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2828,7 +2872,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), - prefix_extractor, handle.offset(), no_io); + prefix_extractor, handle.offset(), no_io, + /*const_ikey_ptr=*/nullptr, &lookup_context); if (not_exist_in_filter) { // Not found @@ -2841,8 +2886,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, DataBlockIter biter; NewDataBlockIterator( read_options, iiter->value(), &biter, BlockType::kData, - true /* key_includes_seq */, true /* index_key_is_full */, - get_context); + /*key_includes_seq=*/true, + /*index_key_is_full=*/true, get_context, &lookup_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { @@ -2907,6 +2953,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { + 
BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserMGet}; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; FilterBlockReader* filter = nullptr; @@ -2915,16 +2962,16 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { if (!skip_filters) { // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, + filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, read_options.read_tier == kBlockCacheTier, - nullptr /*get_context*/); + /*get_context=*/nullptr, &lookup_context); } filter = filter_entry.GetValue(); // First check the full filter // If full filter not useful, Then go into each block FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor); + prefix_extractor, &lookup_context); } if (skip_filters || !sst_file_range.empty()) { IndexBlockIter iiter_on_stack; @@ -2937,7 +2984,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, - sst_file_range.begin()->get_context); + sst_file_range.begin()->get_context, &lookup_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2958,11 +3005,12 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, offset = iiter->value().offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, false, - true /* key_includes_seq */, get_context); + read_options, iiter->value(), &biter, BlockType::kData, + /*key_includes_seq=*/false, + /*index_key_is_full=*/true, get_context, &lookup_context, + Status(), nullptr); reusing_block = false; } - if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { // couldn't get block from block_cache @@ -3040,9 +3088,11 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); } - + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr = @@ -3077,7 +3127,12 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, // Load the block specified by the block_handle into the block cache DataBlockIter biter; - NewDataBlockIterator(ReadOptions(), block_handle, &biter); + + NewDataBlockIterator( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*key_includes_seq=*/true, /*index_key_is_full=*/true, + /*get_context=*/nullptr, &lookup_context, Status(), + /*prefetch_buffer=*/nullptr); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -3089,6 +3144,8 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, } Status BlockBasedTable::VerifyChecksum() { + // TODO(haoyu): This function is called by external sst ingestion and the + // verify checksum public API. We don't log its block cache accesses for now. 
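  // Illustrative aside, not part of the patch: tracing VerifyChecksum would
  // need its own caller tag, since none of the existing enum values fits.
  // A hypothetical version (kVerifyChecksum does not exist in this patch):
  //
  //   BlockCacheLookupContext lookup_context{
  //       BlockCacheLookupCaller::kVerifyChecksum};
  //
  // which is presumably why the NewIndexIterator call below passes a null
  // lookup context instead.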
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> meta;
@@ -3104,8 +3161,9 @@ Status BlockBasedTable::VerifyChecksum() {
   }
   // Check Data blocks
   IndexBlockIter iiter_on_stack;
-  InternalIteratorBase<BlockHandle>* iiter =
-      NewIndexIterator(ReadOptions(), false, &iiter_on_stack);
+  InternalIteratorBase<BlockHandle>* iiter = NewIndexIterator(
+      ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr);
   std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
   if (iiter != &iiter_on_stack) {
     iiter_unique_ptr =
@@ -3199,8 +3257,9 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                       const Slice& key) {
-  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(
-      NewIndexIterator(options));
+  std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter(NewIndexIterator(
+      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr));
   iiter->Seek(key);
   assert(iiter->Valid());
@@ -3234,7 +3293,8 @@ BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
 Status BlockBasedTable::CreateIndexReader(
     FilePrefetchBuffer* prefetch_buffer,
     InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
-    bool pin, IndexReader** index_reader) {
+    bool pin, IndexReader** index_reader,
+    BlockCacheLookupContext* lookup_context) {
   auto index_type_on_file = rep_->index_type;
 
   // kHashSearch requires non-empty prefix_extractor but bypass checking
@@ -3246,11 +3306,13 @@ Status BlockBasedTable::CreateIndexReader(
   switch (index_type_on_file) {
     case BlockBasedTableOptions::kTwoLevelIndexSearch: {
       return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
-                                          prefetch, pin, index_reader);
+                                          prefetch, pin, index_reader,
+                                          lookup_context);
     }
     case BlockBasedTableOptions::kBinarySearch: {
       return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
-                                             prefetch, pin, index_reader);
+                                             prefetch, pin, index_reader,
+                                             lookup_context);
     }
     case BlockBasedTableOptions::kHashSearch: {
       std::unique_ptr<Block> meta_guard;
@@ -3264,14 +3326,16 @@ Status BlockBasedTable::CreateIndexReader(
         ROCKS_LOG_WARN(rep_->ioptions.info_log,
                        "Unable to read the metaindex block."
                        " Fall back to binary search index.");
-        return BinarySearchIndexReader::Create(
-            this, prefetch_buffer, use_cache, prefetch, pin, index_reader);
+        return BinarySearchIndexReader::Create(this, prefetch_buffer,
+                                               use_cache, prefetch, pin,
+                                               index_reader, lookup_context);
       }
       meta_index_iter = meta_iter_guard.get();
     }
     return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
-                                   use_cache, prefetch, pin, index_reader);
+                                   use_cache, prefetch, pin, index_reader,
+                                   lookup_context);
     }
     default: {
       std::string error_message =
@@ -3281,9 +3345,15 @@
   }
 }
 
-uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
+                                              bool for_compaction) {
+  BlockCacheLookupContext context(
+      for_compaction ? BlockCacheLookupCaller::kCompaction
+                     : BlockCacheLookupCaller::kUserApproximateSize);
   std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/&context));
   index_iter->Seek(key);
 
   uint64_t result;
@@ -3319,7 +3389,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const {
 Status BlockBasedTable::GetKVPairsFromDataBlocks(
     std::vector<KVPairBlock>* kv_pair_blocks) {
   std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/nullptr));
 
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
@@ -3337,7 +3409,11 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks(
     std::unique_ptr<InternalIterator> datablock_iter;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
-        ReadOptions(), blockhandles_iter->value()));
+        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
+        /*type=*/BlockType::kData,
+        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
+        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
+        /*prefetch_buffer=*/nullptr));
     s = datablock_iter->status();
 
     if (!s.ok()) {
@@ -3545,7 +3621,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
       "Index Details:\n"
       "--------------------------------------\n");
   std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_file->Append("Can not read Index Block \n\n");
@@ -3594,7 +3672,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
 
 Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
   std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter(
-      NewIndexIterator(ReadOptions()));
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_file->Append("Can not read Index Block \n\n");
@@ -3628,7 +3708,11 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
     std::unique_ptr<InternalIterator> datablock_iter;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
-        ReadOptions(), blockhandles_iter->value()));
+        ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr,
+        /*type=*/BlockType::kData,
+        /*key_includes_seq=*/true, /*index_key_is_full=*/true,
+        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
+        /*prefetch_buffer=*/nullptr));
     s = datablock_iter->status();
 
     if (!s.ok()) {
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index d8319a3e711..a92289f9bee 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -113,17 +113,22 @@ class BlockBasedTable : public TableReader {
   bool PrefixMayMatch(const Slice& internal_key,
                       const ReadOptions& read_options,
                       const SliceTransform* options_prefix_extractor,
-                      const bool need_upper_bound_check) const;
+                      const bool need_upper_bound_check,
+                      BlockCacheLookupContext* lookup_context) const;
 
   // Returns a new iterator over the table contents.
   // The result of NewIterator() is initially invalid (caller must
   // call one of the Seek methods on the iterator before using it).
// @param skip_filters Disables loading/accessing the filter block - InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + InternalIterator* NewIterator( + const ReadOptions&, const SliceTransform* prefix_extractor, + Arena* arena = nullptr, bool skip_filters = false, + // TODO(haoyu) 1. External SST ingestion sets for_compaction as false. 2. + // Compaction also sets it to false when paranoid_file_checks is true, + // i.e., it will populate the block cache with blocks in the new SST + // files. We treat those as a user is calling iterator for now. We should + // differentiate the callers. + bool for_compaction = false) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -149,7 +154,7 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, bool for_compaction) override; bool TEST_BlockInCache(const BlockHandle& handle) const; @@ -193,7 +198,8 @@ class BlockBasedTable : public TableReader { // returned object. virtual InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* iter, GetContext* get_context) = 0; + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; // Report an approximation of how much memory has been used other than // memory that was allocated in block cache. @@ -222,10 +228,10 @@ class BlockBasedTable : public TableReader { template TBlockIter* NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& block_handle, - TBlockIter* input_iter = nullptr, BlockType block_type = BlockType::kData, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, Status s = Status(), - FilePrefetchBuffer* prefetch_buffer = nullptr) const; + TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, + bool index_key_is_full, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer) const; class PartitionedIndexIteratorState; @@ -262,7 +268,7 @@ class BlockBasedTable : public TableReader { FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context = nullptr) const; + GetContext* get_context, BlockCacheLookupContext* lookup_context) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -271,23 +277,25 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context) const; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file // were they not present in cache yet. 
CachableEntry GetFilter( - const SliceTransform* prefix_extractor = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false, - GetContext* get_context = nullptr) const; + const SliceTransform* prefix_extractor, + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; virtual CachableEntry GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, - const SliceTransform* prefix_extractor = nullptr) const; + BlockCacheLookupContext* lookup_context, + const SliceTransform* prefix_extractor) const; CachableEntry GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context) const; + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // Get the iterator from the index reader. // If input_iter is not set, return new Iterator @@ -300,9 +308,9 @@ class BlockBasedTable : public TableReader { // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier InternalIteratorBase* NewIndexIterator( - const ReadOptions& read_options, bool need_upper_bound_check = false, - IndexBlockIter* input_iter = nullptr, - GetContext* get_context = nullptr) const; + const ReadOptions& read_options, bool need_upper_bound_check, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -352,17 +360,20 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, - IndexReader** index_reader); + IndexReader** index_reader, + BlockCacheLookupContext* lookup_context); - bool FullFilterKeyMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - const Slice& user_key, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; + bool FullFilterKeyMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; - void FullFilterKeysMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; + void FullFilterKeysMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, MultiGetRange* range, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; static Status PrefetchTail( RandomAccessFileReader* file, uint64_t file_size, @@ -380,14 +391,16 @@ class BlockBasedTable : public TableReader { const SequenceNumber largest_seqno); Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator); + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context); Status ReadCompressionDictBlock( FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* compression_dict_block) const; Status PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, - const BlockBasedTableOptions& table_options, const int level); + 
const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); @@ -583,7 +596,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { block_type_(block_type), key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), - for_compaction_(for_compaction) {} + for_compaction_(for_compaction), + lookup_context_(for_compaction + ? BlockCacheLookupCaller::kCompaction + : BlockCacheLookupCaller::kUserIterator) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -644,7 +660,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool CheckPrefixMayMatch(const Slice& ikey) { if (check_filter_ && !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, - need_upper_bound_check_)) { + need_upper_bound_check_, &lookup_context_)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -702,6 +718,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { // If this iterator is created for compaction bool for_compaction_; BlockHandle prev_index_value_; + BlockCacheLookupContext lookup_context_; // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 378cdacfff6..d54de5ae1ab 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -30,6 +30,7 @@ #include "rocksdb/table.h" #include "table/format.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" #include "util/hash.h" namespace rocksdb { @@ -99,18 +100,19 @@ class FilterBlockReader { */ virtual bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey)) { + if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, + context)) { range->SkipKey(iter); } } @@ -121,19 +123,19 @@ class FilterBlockReader { */ virtual bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; if 
(!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey)) { + block_offset, no_io, &ikey, context)) { range->SkipKey(iter); } } @@ -156,13 +158,13 @@ class FilterBlockReader { virtual bool RangeMayExist( const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, - bool* filter_checked, bool /*need_upper_bound_check*/) { + const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool /*need_upper_bound_check*/, BlockCacheLookupContext* context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, context); } protected: diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 56dc74c6710..6d2b9d70a50 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -124,7 +124,8 @@ FullFilterBlockReader::FullFilterBlockReader( bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)block_offset; #endif @@ -138,7 +139,8 @@ bool FullFilterBlockReader::KeyMayMatch( bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + const Slice* const /*const_ikey_ptr*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)block_offset; #endif @@ -161,7 +163,8 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/) { + uint64_t block_offset, const bool /*no_io*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)range; (void)block_offset; @@ -177,7 +180,8 @@ void FullFilterBlockReader::KeysMayMatch( void FullFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/) { + uint64_t block_offset, const bool /*no_io*/, + BlockCacheLookupContext* /*context*/) { #ifdef NDEBUG (void)range; (void)block_offset; @@ -224,10 +228,11 @@ size_t FullFilterBlockReader::ApproximateMemoryUsage() const { return usage; } -bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, - const Slice& user_key, const SliceTransform* prefix_extractor, - const Comparator* comparator, const Slice* const const_ikey_ptr, - bool* filter_checked, bool need_upper_bound_check) { +bool FullFilterBlockReader::RangeMayExist( + const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, BlockCacheLookupContext* context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -240,7 +245,7 @@ bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, 
kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, context); } } diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 61df028c920..99e5299b34f 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -95,35 +95,38 @@ class FullFilterBlockReader : public FilterBlockReader { // bits_reader is created in filter_policy, it should be passed in here // directly. and be deleted here - ~FullFilterBlockReader() {} + ~FullFilterBlockReader() override {} + + bool IsBlockBased() override { return false; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) override; + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* context) override; + size_t ApproximateMemoryUsage() const override; + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, + BlockCacheLookupContext* context) override; - virtual bool IsBlockBased() override { return false; } - - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual void KeysMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - - virtual void PrefixesMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - virtual size_t ApproximateMemoryUsage() const override; - virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* comparator, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check) override; private: const SliceTransform* prefix_extractor_; Slice contents_; diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 82c43b34ed6..57ff158c5c7 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -112,7 +112,9 @@ TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -127,13 +129,27 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { FullFilterBlockReader reader( nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } class FullFilterBlockTest : public testing::Test { @@ -157,7 +173,9 @@ TEST_F(FullFilterBlockTest, EmptyBuilder) { nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } TEST_F(FullFilterBlockTest, DuplicateEntries) { @@ -207,13 +225,27 @@ TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockReader reader( nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, 
/*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); } } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 7874ce1874f..e80085dfb5b 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -162,8 +162,8 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { bool PartitionedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) { assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); if (!whole_key_filtering_) { @@ -177,19 +177,20 @@ bool PartitionedFilterBlockReader::KeyMayMatch( return false; } auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - prefix_extractor); + GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, + prefix_extractor, context); if (UNLIKELY(!filter_partition.GetValue())) { return true; } - return filter_partition.GetValue()->KeyMayMatch(key, prefix_extractor, - block_offset, no_io); + return filter_partition.GetValue()->KeyMayMatch( + key, prefix_extractor, block_offset, no_io, /*const_ikey_ptr=*/nullptr, + context); } bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) { #ifdef NDEBUG (void)block_offset; #endif @@ -206,13 +207,14 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return false; } auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - prefix_extractor); + GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, + prefix_extractor, context); if (UNLIKELY(!filter_partition.GetValue())) { return true; } - return filter_partition.GetValue()->PrefixMayMatch(prefix, prefix_extractor, - kNotValid, no_io); + return filter_partition.GetValue()->PrefixMayMatch( + prefix, prefix_extractor, kNotValid, no_io, /*const_ikey_ptr=*/nullptr, + context); } BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( @@ -234,7 +236,8 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( CachableEntry PartitionedFilterBlockReader::GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, const SliceTransform* prefix_extractor) { + const bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* context) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -247,9 
+250,10 @@ PartitionedFilterBlockReader::GetFilterPartition( nullptr /* cache_handle */, false /* own_value */}; } } - return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, + return table_->GetFilter(/*prefetch_buffer=*/nullptr, fltr_blk_handle, is_a_filter_partition, no_io, - /* get_context */ nullptr, prefix_extractor); + /*get_context=*/nullptr, context, + prefix_extractor); } else { auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, is_a_filter_partition, prefix_extractor); @@ -273,6 +277,7 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { void PartitionedFilterBlockReader::CacheDependencies( bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( @@ -304,7 +309,7 @@ void PartitionedFilterBlockReader::CacheDependencies( const bool is_a_filter_partition = true; auto filter = table_->GetFilter( prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /* get_context */ nullptr, prefix_extractor); + /*get_context=*/nullptr, &lookup_context, prefix_extractor); if (LIKELY(filter.IsCached())) { if (pin) { filter_map_[handle.offset()] = std::move(filter); diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 6860bf82fec..4b0fb523d0d 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -77,26 +77,28 @@ class PartitionedFilterBlockReader : public FilterBlockReader { Statistics* stats, const InternalKeyComparator comparator, const BlockBasedTable* table, const bool index_key_includes_seq, const bool index_value_is_full); - virtual ~PartitionedFilterBlockReader(); + ~PartitionedFilterBlockReader() override; - virtual bool IsBlockBased() override { return false; } - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; + bool IsBlockBased() override { return false; } + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + BlockCacheLookupContext* context) override; + size_t ApproximateMemoryUsage() const override; private: BlockHandle GetFilterPartitionHandle(const Slice& entry); CachableEntry GetFilterPartition( FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, const SliceTransform* prefix_extractor = nullptr); - virtual void CacheDependencies( - bool bin, const SliceTransform* prefix_extractor) override; + const bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* context); + void CacheDependencies(bool bin, + const SliceTransform* prefix_extractor) override; const SliceTransform* prefix_extractor_; std::unique_ptr 
idx_on_fltr_blk_; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 70e5bbd3bbd..5af7034968a 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -31,6 +31,7 @@ class MockedBlockBasedTable : public BlockBasedTable { CachableEntry GetFilter( FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, const bool /* unused */, bool /* unused */, GetContext* /* unused */, + BlockCacheLookupContext* /*context*/, const SliceTransform* prefix_extractor) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj = new FullFilterBlockReader( @@ -168,14 +169,15 @@ class PartitionedFilterBlockTest auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice)); + &ikey_slice, /*context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + !no_io, &ikey_slice, + /*context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { @@ -183,11 +185,13 @@ class PartitionedFilterBlockTest const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + !no_io, &ikey_slice, + /*context=*/nullptr)); } else { // assuming a good hash function ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + !no_io, &ikey_slice, + /*context=*/nullptr)); } } } @@ -335,9 +339,9 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), - prefix_extractor.get(), kNotValid, - false /*no_io*/, &ikey_slice)); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*context=*/nullptr)); } } diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index b37d46373e1..0080a76e158 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -56,7 +56,10 @@ class CuckooTableReader: public TableReader { size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + bool /*for_compaction*/ = false) override { + return 0; + } void SetupForCompaction() override {} // End of methods not implemented. 
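A side note on the `for_compaction` default that these overrides re-declare: in C++, default arguments on virtual functions bind to the static type of the call, so a call made through the `TableReader` interface always picks up the base declaration's default. A minimal sketch of the calling convention, assuming a hypothetical `OpenTableReader()` helper and a `Slice key` in scope:

```cpp
// Calls through the base interface use TableReader's own default
// (for_compaction = false), regardless of the dynamic reader type.
std::unique_ptr<TableReader> reader = OpenTableReader();  // hypothetical helper
uint64_t off_user = reader->ApproximateOffsetOf(key);
uint64_t off_comp = reader->ApproximateOffsetOf(key, /*for_compaction=*/true);
```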
diff --git a/table/mock_table.h b/table/mock_table.h index 42e28266d99..005de1c3dc2 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -50,9 +50,12 @@ class MockTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + bool /*for_compaction*/ = false) override { + return 0; + } - virtual size_t ApproximateMemoryUsage() const override { return 0; } + size_t ApproximateMemoryUsage() const override { return 0; } void SetupForCompaction() override {} diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 38852059bf9..15f7be1c253 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -613,7 +613,8 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, + bool /*for_compaction*/) { return 0; } diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 6c1c12ab8bb..774e2eb36ef 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -89,7 +89,8 @@ class PlainTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, + bool for_compaction = false) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; diff --git a/table/table_reader.h b/table/table_reader.h index 037dbc33818..bf3289818d6 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -61,7 +61,8 @@ class TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + virtual uint64_t ApproximateOffsetOf(const Slice& key, + bool for_compaction = false) = 0; // Set up the table for Compaction. Might change some parameters with // posix_fadvise diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 7b3c82e2b7e..5fd14cbf11b 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -17,12 +17,38 @@ enum BlockCacheLookupCaller : char { kUserGet = 1, kUserMGet = 2, kUserIterator = 3, - kPrefetch = 4, - kCompaction = 5, + kUserApproximateSize = 4, + kPrefetch = 5, + kCompaction = 6, // All callers should be added before kMaxBlockCacheLookupCaller. kMaxBlockCacheLookupCaller }; +// Lookup context for tracing block cache accesses. +// We trace block accesses at five places: +// 1. BlockBasedTable::GetFilter +// 2. BlockBasedTable::GetUncompressedDict. +// 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, +// and range deletion block.) +// 4. BlockBasedTable::Get. (To trace the referenced key and whether the +// referenced key exists in a fetched data block.) +// 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the +// referenced key exists in a fetched data block.) +// The context is created at: +// 1. BlockBasedTable::Get. (kUserGet) +// 2. BlockBasedTable::MultiGet. 
(kUserMGet) +// 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or +// external SST ingestion calls this function.) +// 4. BlockBasedTable::Open. (kPrefetch) +// 5. Index/Filter::CacheDependencies. (kPrefetch) +// 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or +// kUserApproximateSize). +struct BlockCacheLookupContext { + BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) + : caller(_caller) {} + const BlockCacheLookupCaller caller; +}; + enum Boolean : char { kTrue = 1, kFalse = 0 }; struct BlockCacheTraceRecord { From 641cc8d541685cad1629bd99bc08ca958458d456 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 10 Jun 2019 15:53:46 -0700 Subject: [PATCH 124/572] Use CreateLoggerFromOptions function (#5427) Summary: Use `CreateLoggerFromOptions` function to reduce code duplication. Test plan (on my machine) ``` $make clean && make -j32 db_secondary_test $KEEP_DB=1 ./db_secondary_test ``` Verify all info logs of the secondary instance are properly logged. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5427 Differential Revision: D15748922 Pulled By: riversand963 fbshipit-source-id: bad7261df1b8373efc504f141efc7871e375a311 --- db/db_impl/db_impl_secondary.cc | 35 +++++---------------------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index eb8c4c98738..2737df0ae8c 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -521,39 +521,14 @@ Status DB::OpenAsSecondary( } DBOptions tmp_opts(db_options); + Status s; if (nullptr == tmp_opts.info_log) { - Env* env = tmp_opts.env; - assert(env != nullptr); - std::string secondary_abs_path; - env->GetAbsolutePath(secondary_path, &secondary_abs_path); - std::string fname = InfoLogFileName(secondary_path, secondary_abs_path, - tmp_opts.db_log_dir); - - env->CreateDirIfMissing(secondary_path); - if (tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { - AutoRollLogger* result = new AutoRollLogger( - env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, - tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); - Status s = result->GetStatus(); - if (!s.ok()) { - delete result; - } else { - tmp_opts.info_log.reset(result); - } - } - if (nullptr == tmp_opts.info_log) { - env->RenameFile( - fname, OldInfoLogFileName(secondary_path, env->NowMicros(), - secondary_abs_path, tmp_opts.db_log_dir)); - Status s = env->NewLogger(fname, &(tmp_opts.info_log)); - if (tmp_opts.info_log != nullptr) { - tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); - } + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; } } - assert(tmp_opts.info_log != nullptr); - handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); impl->versions_.reset(new ReactiveVersionSet( @@ -563,7 +538,7 @@ Status DB::OpenAsSecondary( impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); impl->mutex_.Lock(); - Status s = impl->Recover(column_families, true, false, false); + s = impl->Recover(column_families, true, false, false); if (s.ok()) { for (auto cf : column_families) { auto cfd = From b2584577fa66ccb16c3b67a0347188d2474660ce Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 10 Jun 2019 16:46:04 -0700 Subject: [PATCH 125/572] Remove global locks from FlushScheduler (#5372) Summary: FlushScheduler's methods are 
instrumented with debug-time locks to check the scheduler state against a simple container definition. Since https://github.com/facebook/rocksdb/pull/2286 the scope of such locks was widened to the entire body of the methods. The result is that the concurrency exercised during testing (in debug mode) is stricter than the concurrency level manifested at runtime (in release mode). The patch reverts this change to reduce the scope of such locks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5372

Differential Revision: D15545831

Pulled By: maysamyabandeh

fbshipit-source-id: 01d69191afb1dd807d4bdc990fc74813ae7b5426
---
 db/flush_scheduler.cc | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc
index 8735a6b369b..9c6c04efe33 100644
--- a/db/flush_scheduler.cc
+++ b/db/flush_scheduler.cc
@@ -13,9 +13,11 @@ namespace rocksdb {

 void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
 #ifndef NDEBUG
-  std::lock_guard lock(checking_mutex_);
-  assert(checking_set_.count(cfd) == 0);
-  checking_set_.insert(cfd);
+  {
+    std::lock_guard lock(checking_mutex_);
+    assert(checking_set_.count(cfd) == 0);
+    checking_set_.insert(cfd);
+  }
 #endif  // NDEBUG
   cfd->Ref();
// Suppress false positive clang analyzer warnings.
@@ -32,9 +34,6 @@ void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
 }

 ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
-#ifndef NDEBUG
-  std::lock_guard lock(checking_mutex_);
-#endif  // NDEBUG
   while (true) {
     if (head_.load(std::memory_order_relaxed) == nullptr) {
       return nullptr;
     }
@@ -47,9 +46,12 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
     delete node;

 #ifndef NDEBUG
-    auto iter = checking_set_.find(cfd);
-    assert(iter != checking_set_.end());
-    checking_set_.erase(iter);
+    {
+      std::lock_guard lock(checking_mutex_);
+      auto iter = checking_set_.find(cfd);
+      assert(iter != checking_set_.end());
+      checking_set_.erase(iter);
+    }
 #endif  // NDEBUG

     if (!cfd->IsDropped()) {
@@ -65,12 +67,12 @@ }

 bool FlushScheduler::Empty() {
-#ifndef NDEBUG
-  std::lock_guard lock(checking_mutex_);
-#endif  // NDEBUG
   auto rv = head_.load(std::memory_order_relaxed) == nullptr;
 #ifndef NDEBUG
+  std::lock_guard lock(checking_mutex_);
+  // Empty is allowed to be called concurrently with ScheduleFlush. It may
+  // only miss the most recent schedules.
   assert((rv == checking_set_.empty()) || rv);
 #endif  // NDEBUG
   return rv;
 }

From c8c1a549f0cf88fa3d7c82d1f0d96d4b7dcffbf1 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 10 Jun 2019 17:02:23 -0700
Subject: [PATCH 126/572] Avoid deadlock between mutex_ and log_write_mutex_
 (#5437)

Summary:
To avoid deadlock, mutex_ must never be acquired while log_write_mutex_ is already held; when both are needed, the order is mutex_ first, then log_write_mutex_. The patch documents that and also fixes one case in ::FlushWAL that acquires mutex_ through ::WriteStatusCheck while it already holds a lock on log_write_mutex_.
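Schematically, the fix scopes the WAL-writer lock so that it is released before any path that can take mutex_; a condensed sketch of the corrected FlushWAL flow (error handling and the sync path elided):

```cpp
Status s;
{
  // log_write_mutex_ protects logs_; hold it only for the buffer write.
  InstrumentedMutexLock wl(&log_write_mutex_);
  s = logs_.back().writer->WriteBuffer();
}  // log_write_mutex_ released here
if (!s.ok()) {
  // WriteStatusCheck may acquire mutex_; that is safe now that
  // log_write_mutex_ is no longer held.
  WriteStatusCheck(s);
}
```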
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5437

Differential Revision: D15749722

Pulled By: maysamyabandeh

fbshipit-source-id: f57b69c44b4b80cc6d7ddf3d3fdf4a9eb5a5a45a
---
 db/db_impl/db_impl.cc | 11 +++++++----
 db/db_impl/db_impl.h  |  2 ++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index b1a828f9f0e..0a480a4a2eb 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1046,10 +1046,13 @@ int DBImpl::FindMinimumEmptyLevelFitting(
 Status DBImpl::FlushWAL(bool sync) {
   if (manual_wal_flush_) {
-    // We need to lock log_write_mutex_ since logs_ might change concurrently
-    InstrumentedMutexLock wl(&log_write_mutex_);
-    log::Writer* cur_log_writer = logs_.back().writer;
-    auto s = cur_log_writer->WriteBuffer();
+    Status s;
+    {
+      // We need to lock log_write_mutex_ since logs_ might change concurrently
+      InstrumentedMutexLock wl(&log_write_mutex_);
+      log::Writer* cur_log_writer = logs_.back().writer;
+      s = cur_log_writer->WriteBuffer();
+    }
     if (!s.ok()) {
       ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
                       s.ToString().c_str());
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 111a91e04f3..4c80b6a4d0c 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1521,6 +1521,8 @@ class DBImpl : public DB {
   // logfile_number_. With two_write_queues it also protects alive_log_files_,
   // and log_empty_. Refer to the definition of each variable below for more
   // details.
+  // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+  // acquired, the order should be first mutex_ and then log_write_mutex_.
   InstrumentedMutex log_write_mutex_;

   std::atomic shutting_down_;

From a94aef6596f876561b28aad7cdcd0c92f04cc1d6 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Mon, 10 Jun 2019 18:43:32 -0700
Subject: [PATCH 127/572] Fix DBTest.DynamicMiscOptions so it passes even with
 Snappy disabled (#5438)

Summary:
This affects our "no compression" automated tests. Since PR #5368, DBTest.DynamicMiscOptions has been failing with:

db/db_test.cc:4889: Failure
dbfull()->SetOptions({{"compression", "kSnappyCompression"}})
Invalid argument: Compression type Snappy is not linked with the binary.
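The fix below swaps the OS_WIN preprocessor guard for a runtime capability check, so the compression-dependent assertions run exactly when the codec is linked in; schematically:

```cpp
// Gate compression-dependent assertions on runtime support rather than
// a platform #ifdef; builds without Snappy simply skip them.
if (Snappy_Supported()) {
  ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
}
```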
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5438 Differential Revision: D15752100 Pulled By: ltamasi fbshipit-source-id: 3f19eff7cafc03b333965be0203c5853d2a9cb71 --- db/db_test.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 27cf790ee57..a27a5eeb97f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4884,14 +4884,15 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); - // Appveyor fails with: Compression type Snappy is not linked with the binary -#ifndef OS_WIN - ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); - ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], - &mutable_cf_options)); - ASSERT_EQ(CompressionType::kSnappyCompression, - mutable_cf_options.compression); -#endif + + if (Snappy_Supported()) { + ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], + &mutable_cf_options)); + ASSERT_EQ(CompressionType::kSnappyCompression, + mutable_cf_options.compression); + } + // Test paranoid_file_checks already done in db_block_cache_test ASSERT_OK( dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); From 58c4aee42e9ebe008efa2cfdfad107206879446c Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 11 Jun 2019 11:42:19 -0700 Subject: [PATCH 128/572] TransactionUtil::CheckKey() to skip unnecessary history (#4941) Summary: If a memtable definitely covers a key, there isn't a need to check older memtables. We can skip them by checking the earliest sequence number. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4941 Differential Revision: D13932666 fbshipit-source-id: b9d52f234b8ad9dd3bf6547645cd457175a3ca9b --- db/db_impl/db_impl.cc | 22 ++- db/db_impl/db_impl.h | 8 +- utilities/blob_db/blob_db_impl.cc | 4 +- .../optimistic_transaction_test.cc | 118 +++++++++++++++ utilities/transactions/transaction_util.cc | 19 ++- utilities/transactions/transaction_util.h | 11 ++ .../write_prepared_transaction_test.cc | 141 ++++++++++++++++++ 7 files changed, 318 insertions(+), 5 deletions(-) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 0a480a4a2eb..27d48539c35 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3412,7 +3412,9 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, #ifndef ROCKSDB_LITE Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, SequenceNumber* seq, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, bool* found_record_for_key, bool* is_blob_index) { Status s; @@ -3445,6 +3447,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber(); + if (lower_bound_in_mem != kMaxSequenceNumber && + lower_bound_in_mem < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + // Check if there is a record for this key in the immutable memtables sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, seq, read_options, nullptr /*read_callback*/, is_blob_index); @@ -3464,6 +3473,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber 
lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+  if (lower_bound_in_imm != kMaxSequenceNumber &&
+      lower_bound_in_imm < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
  // Check if there is a record for this key in the immutable memtables
  sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, seq, read_options,
@@ -3485,6 +3501,10 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
    return Status::OK();
  }

+  // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+  // check here to skip the history if possible. But currently the caller
+  // already does that. Maybe we should move the logic here later.
+
  // TODO(agiardullo): possible optimization: consider checking cached
  // SST files if cache_only=true?
  if (!cache_only) {
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 4c80b6a4d0c..4de15f0324d 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -413,11 +413,17 @@ class DBImpl : public DB {
  // snapshot, we know that no key could have existing after this snapshot
  // (since we do not compact keys that have an earlier snapshot).
  //
+  // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if they only contain
+  // data older than `lower_bound_seq`.
+  //
  // Returns OK or NotFound on success,
  // other status on unexpected error.
  // TODO(andrewkr): this API need to be aware of range deletion operations
  Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
-                                 bool cache_only, SequenceNumber* seq,
+                                 bool cache_only,
+                                 SequenceNumber lower_bound_seq,
+                                 SequenceNumber* seq,
                                  bool* found_record_for_key,
                                  bool* is_blob_index = nullptr);

diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 25583fa981a..86eb1460c15 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1426,8 +1426,8 @@ class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
    bool found_record_for_key = false;
    bool is_blob_index = false;
    Status s = db_impl->GetLatestSequenceForKey(
-        sv, key_, false /*cache_only*/, &latest_seq, &found_record_for_key,
-        &is_blob_index);
+        sv, key_, false /*cache_only*/, 0 /*lower_bound_seq*/, &latest_seq,
+        &found_record_for_key, &is_blob_index);
    db_impl->ReturnAndCleanupSuperVersion(cfd_, sv);
    if (!s.ok() && !s.IsNotFound()) {
      // Error.
diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc
index 5e1af2fb1f5..3aa6c207a48 100644
--- a/utilities/transactions/optimistic_transaction_test.cc
+++ b/utilities/transactions/optimistic_transaction_test.cc
@@ -9,11 +9,15 @@
 #include
 #include
+
+#include "db/db_impl/db_impl.h"
 #include "logging/logging.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
 #include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "rocksdb/utilities/transaction.h"
+#include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/transaction_test_util.h"
 #include "util/crc32c.h"
@@ -308,6 +312,120 @@ TEST_F(OptimisticTransactionTest, FlushTest2) {
  delete txn;
 }

+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
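+// The test runs two attempts: in the first, the old memtable is flushed and
+// survives only in the flushed-memtable history; in the second, the flush is
+// held back so the memtable is switched to an immutable memtable instead.
+// Both shapes of "old" data must be handled by the lower-bound skipping.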
+TEST_F(OptimisticTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    options.max_write_buffer_number_to_maintain = 3;
+    Reopen();
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    ReadOptions snapshot_read_options;
+    ReadOptions snapshot_read_options2;
+    string value;
+    Status s;
+
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn != nullptr);
+
+    Transaction* txn2 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn2 != nullptr);
+
+    snapshot_read_options.snapshot = txn->GetSnapshot();
+    ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+    snapshot_read_options2.snapshot = txn2->GetSnapshot();
+    ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2")));
+
+    // txn updates "foo" and txn2 updates "foo2", and now a write is
+    // issued for "foo", which conflicts with txn but not txn2
+    ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold the flush from the beginning. The
+      // memtable will be switched to immutable after calling
+      // TEST_SwitchMemtable() while CheckKey() is called.
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"OptimisticTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // Force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(txn_db->Flush(flush_ops));
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast(txn_db->GetRootDB());
+      db_impl->TEST_SwitchMemtable();
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                       &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in active memtable
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing. When this transaction is committed,
+    // only the active memtable needs to be checked.
+    Transaction* txn3 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn3 != nullptr);
+
+    // Commit both txn and txn2. txn will conflict but txn2 will
+    // pass. Either way, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+
+    get_perf_context()->Reset();
+    s = txn->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    // txn should fail because of the conflict, even though the memtable
+    // has been flushed, because the entry is still preserved in history.
+    ASSERT_TRUE(s.IsBusy());
+
+    get_perf_context()->Reset();
+    s = txn2->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    ASSERT_TRUE(s.ok());
+
+    txn3->Put(Slice("foo2"), Slice("bar2"));
+    get_perf_context()->Reset();
+    s = txn3->Commit();
+    // txn3 is created after the active memtable is created, so that is the only
+    // memtable to check.
+    ASSERT_EQ(1, get_perf_context()->get_from_memtable_count);
+    ASSERT_TRUE(s.ok());
+
+    TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable");
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+    SetPerfLevel(PerfLevel::kDisable);
+
+    delete txn;
+    delete txn2;
+    delete txn3;
+  }
+}
+
 TEST_F(OptimisticTransactionTest, NoSnapshotTest) {
  WriteOptions write_options;
  ReadOptions read_options;
diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc
index 407feaaa88a..ba3b75e15bf 100644
--- a/utilities/transactions/transaction_util.cc
+++ b/utilities/transactions/transaction_util.cc
@@ -52,6 +52,12 @@ Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
                                 const std::string& key, bool cache_only,
                                 ReadCallback* snap_checker,
                                 SequenceNumber min_uncommitted) {
+  // When `min_uncommitted` is provided, keys are not always committed
+  // in sequence number order, and `snap_checker` is used to check whether
+  // a specific sequence number in the database is visible to the transaction.
+  // So `snap_checker` must be provided.
+  assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
+
  Status result;
  bool need_to_read_sst = false;
@@ -100,8 +106,19 @@ Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
    SequenceNumber seq = kMaxSequenceNumber;
    bool found_record_for_key = false;

+    // When min_uncommitted == kMaxSequenceNumber, writes are committed in
+    // sequence number order, so only keys larger than `snap_seq` can cause
+    // conflict.
+    // When min_uncommitted != kMaxSequenceNumber, keys lower than
+    // min_uncommitted will not trigger conflicts, while keys larger than
+    // min_uncommitted might create conflicts, so we need to read them out
+    // from the DB and ask snap_checker to determine. So only
+    // keys lower than min_uncommitted can be skipped.
+    SequenceNumber lower_bound_seq =
+        (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
    Status s = db_impl->GetLatestSequenceForKey(sv, key, !need_to_read_sst,
-                                                &seq, &found_record_for_key);
+                                                lower_bound_seq, &seq,
+                                                &found_record_for_key);

    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
      result = s;
diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h
index 0fe0e87d862..1d910134b66 100644
--- a/utilities/transactions/transaction_util.h
+++ b/utilities/transactions/transaction_util.h
@@ -50,6 +50,9 @@ class TransactionUtil {
  // SST files. This will make it more likely this function will
  // return an error if it is unable to determine if there are any conflicts.
  //
+  // See the comment of CheckKey() for an explanation of `snap_seq`,
+  // `snap_checker` and `min_uncommitted`.
+  //
  // Returns OK on success, BUSY if there is a conflicting write, or other error
  // status for any unexpected errors.
  static Status CheckKeyForConflicts(
@@ -72,6 +75,14 @@ class TransactionUtil {
      bool cache_only);

 private:
+  // If `snap_checker` == nullptr, writes are always committed in sequence
+  // number order. Any write with a sequence number <= `snap_seq` does not
+  // conflict, and any write to `key` with a sequence number > `snap_seq`
+  // triggers a conflict.
+  // If `snap_checker` != nullptr, writes may not commit in sequence number
+  // order. In this case `min_uncommitted` is a lower bound:
+  // seq < `min_uncommitted`: no conflict; seq > `snap_seq`: conflict;
+  // `min_uncommitted` <= seq <= `snap_seq`: ask `snap_checker` to determine.
  static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
                         SequenceNumber earliest_seq, SequenceNumber snap_seq,
                         const std::string& key, bool cache_only,
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index e62b8344169..88f4ea032a9 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -761,6 +761,147 @@ TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) {
  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
 }

+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
+TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    options.max_write_buffer_number_to_maintain = 3;
+    ReOpen();
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    TransactionOptions txn_options;
+    txn_options.set_snapshot = true;
+    string value;
+    Status s;
+
+    ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn != nullptr);
+    ASSERT_OK(txn->SetName("txn"));
+
+    Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn2 != nullptr);
+    ASSERT_OK(txn2->SetName("txn2"));
+
+    // This transaction is created to cause a potential conflict.
+    Transaction* txn_x = db->BeginTransaction(write_options);
+    ASSERT_OK(txn_x->SetName("txn_x"));
+    ASSERT_OK(txn_x->Put(Slice("foo"), Slice("bar3")));
+    ASSERT_OK(txn_x->Prepare());
+
+    // Create snapshots after the prepare, but there should still
+    // be a conflict when trying to read "foo".
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold the flush from the beginning. The
+      // memtable will be switched to immutable after calling
+      // TEST_SwitchMemtable() while CheckKey() is called.
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"WritePreparedTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // Force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(db->Flush(flush_ops));
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast(db->GetRootDB());
+      db_impl->TEST_SwitchMemtable();
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                   &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in active memtable
+    ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing, but this transaction also needs to
+    // check all memtables because they contain uncommitted data.
+    Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn3 != nullptr);
+    ASSERT_OK(txn3->SetName("txn3"));
+
+    // Commit the pending write
+    ASSERT_OK(txn_x->Commit());
+
+    // Commit txn, txn2 and txn3. txn and txn3 will conflict but txn2 will
+    // pass. In all cases, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+    get_perf_context()->Reset();
+    ASSERT_TRUE(txn3->GetForUpdate(read_options, "foo", &value).IsBusy());
+    // We should have checked two memtables, active and either immutable
+    // or history memtable, depending on the test case.
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+    get_perf_context()->Reset();
+    ASSERT_TRUE(txn->GetForUpdate(read_options, "foo", &value).IsBusy());
+    // We should have checked two memtables, active and either immutable
+    // or history memtable, depending on the test case.
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+    get_perf_context()->Reset();
+    ASSERT_OK(txn2->GetForUpdate(read_options, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    // We should have checked two memtables, and since there is no
+    // conflict, another Get() will be made to fetch the data from the
+    // DB. If the key is in an immutable memtable, two extra memtable
+    // reads will be issued. If it is not (it is in history), only one
+    // will be made, against the active memtable.
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+    } else {
+      assert(attempt == kAttemptImmMemTable);
+      ASSERT_EQ(4, get_perf_context()->get_from_memtable_count);
+    }
+
+    Transaction* txn4 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn4 != nullptr);
+    ASSERT_OK(txn4->SetName("txn4"));
+    get_perf_context()->Reset();
+    ASSERT_OK(txn4->GetForUpdate(read_options, "foo", &value));
+    if (attempt == kAttemptHistoryMemtable) {
+      // The active memtable will be checked in snapshot validation and when
+      // getting the value.
+      ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    } else {
+      // Only the active memtable will be checked in snapshot validation,
+      // but both the active and immutable memtables will be queried when
+      // getting the value.
+ assert(attempt == kAttemptImmMemTable); + ASSERT_EQ(3, get_perf_context()->get_from_memtable_count); + } + + ASSERT_OK(txn2->Commit()); + ASSERT_OK(txn4->Commit()); + + TEST_SYNC_POINT("WritePreparedTransactionTest.CheckKeySkipOldMemtable"); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + SetPerfLevel(PerfLevel::kDisable); + + delete txn; + delete txn2; + delete txn3; + delete txn4; + delete txn_x; + } +} + // Reproduce the bug with two snapshots with the same seuqence number and test // that the release of the first snapshot will not affect the reads by the other // snapshot From 9bbccda01e127c942c71c3c7fc21c494a2fd1992 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 11 Jun 2019 12:18:37 -0700 Subject: [PATCH 129/572] First commit for block cache trace analyzer (#5425) Summary: This PR contains the first commit for block cache trace analyzer. It reads a block cache trace file and prints statistics of the traces. We will extend this class to provide more functionalities. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5425 Differential Revision: D15709580 Pulled By: HaoyuHuang fbshipit-source-id: 2f43bd2311f460ab569880819d95eeae217c20bb --- CMakeLists.txt | 2 + Makefile | 4 + src.mk | 2 + tools/block_cache_trace_analyzer.cc | 408 +++++++++++++++++++++++ tools/block_cache_trace_analyzer.h | 131 ++++++++ tools/block_cache_trace_analyzer_test.cc | 229 +++++++++++++ trace_replay/block_cache_tracer.cc | 3 +- trace_replay/block_cache_tracer.h | 2 + 8 files changed, 780 insertions(+), 1 deletion(-) create mode 100644 tools/block_cache_trace_analyzer.cc create mode 100644 tools/block_cache_trace_analyzer.h create mode 100644 tools/block_cache_trace_analyzer_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index cef1f85d797..006f6798666 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -621,6 +621,7 @@ set(SOURCES test_util/sync_point_impl.cc test_util/testutil.cc test_util/transaction_test_util.cc + tools/block_cache_trace_analyzer.cc tools/db_bench_tool.cc tools/dump/db_dump_tool.cc tools/ldb_cmd.cc @@ -966,6 +967,7 @@ if(WITH_TESTS) table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc + tools/block_cache_trace_analyzer_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc diff --git a/Makefile b/Makefile index 3ee85ad67d0..425c75eb5f5 100644 --- a/Makefile +++ b/Makefile @@ -562,6 +562,7 @@ TESTS = \ sst_file_reader_test \ db_secondary_test \ block_cache_tracer_test \ + block_cache_trace_analyzer_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -1592,6 +1593,9 @@ db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +block_cache_trace_analyzer_test: tools/block_cache_trace_analyzer_test.o tools/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local diff --git a/src.mk b/src.mk index 6303997cd59..150b1c10af9 100644 --- a/src.mk +++ b/src.mk @@ -240,6 +240,7 @@ TOOL_LIB_SOURCES = \ utilities/blob_db/blob_dump_tool.cc \ ANALYZER_LIB_SOURCES = \ + tools/block_cache_trace_analyzer.cc \ tools/trace_analyzer_tool.cc \ MOCK_LIB_SOURCES = \ @@ -365,6 +366,7 @@ MAIN_SOURCES = \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + tools/block_cache_trace_analyzer_test.cc \ 
tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc new file mode 100644 index 00000000000..5d9b2d18409 --- /dev/null +++ b/tools/block_cache_trace_analyzer.cc @@ -0,0 +1,408 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "tools/block_cache_trace_analyzer.h" + +#include +#include +#include "monitoring/histogram.h" + +namespace rocksdb { +namespace { +std::string block_type_to_string(TraceType type) { + switch (type) { + case kBlockTraceFilterBlock: + return "Filter"; + case kBlockTraceDataBlock: + return "Data"; + case kBlockTraceIndexBlock: + return "Index"; + case kBlockTraceRangeDeletionBlock: + return "RangeDeletion"; + case kBlockTraceUncompressionDictBlock: + return "UncompressionDict"; + default: + break; + } + // This cannot happen. + return "InvalidType"; +} + +std::string caller_to_string(BlockCacheLookupCaller caller) { + switch (caller) { + case kUserGet: + return "Get"; + case kUserMGet: + return "MultiGet"; + case kUserIterator: + return "Iterator"; + case kPrefetch: + return "Prefetch"; + case kCompaction: + return "Compaction"; + default: + break; + } + // This cannot happen. + return "InvalidCaller"; +} +} // namespace + +BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( + const std::string& trace_file_path) + : trace_file_path_(trace_file_path) { + env_ = rocksdb::Env::Default(); +} + +void BlockCacheTraceAnalyzer::RecordAccess( + const BlockCacheTraceRecord& access) { + ColumnFamilyAccessInfoAggregate& cf_aggr = cf_aggregates_map_[access.cf_name]; + SSTFileAccessInfoAggregate& file_aggr = + cf_aggr.fd_aggregates_map[access.sst_fd_number]; + file_aggr.level = access.level; + BlockTypeAccessInfoAggregate& block_type_aggr = + file_aggr.block_type_aggregates_map[access.block_type]; + BlockAccessInfo& block_access_info = + block_type_aggr.block_access_info_map[access.block_key]; + block_access_info.AddAccess(access); +} + +Status BlockCacheTraceAnalyzer::Analyze() { + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + if (!s.ok()) { + return s; + } + BlockCacheTraceReader reader(std::move(trace_reader)); + s = reader.ReadHeader(&header_); + if (!s.ok()) { + return s; + } + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + return s; + } + RecordAccess(access); + } + return Status::OK(); +} + +void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { + HistogramStat bs_stats; + std::map bt_stats_map; + std::map> cf_bt_stats_map; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. 
+ bs_stats.Add(block_access_info.second.block_size); + bt_stats_map[type].Add(block_access_info.second.block_size); + cf_bt_stats_map[cf_name][type].Add( + block_access_info.second.block_size); + } + } + } + } + fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); + for (auto const& bt_stats : bt_stats_map) { + fprintf(stdout, "Block size stats for block type %s: \n%s", + block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + for (auto const& cf_bt_stats : cf_bt_stats_map) { + const std::string& cf_name = cf_bt_stats.first; + for (auto const& bt_stats : cf_bt_stats.second) { + fprintf(stdout, + "Block size stats for column family %s and block type %s: \n%s", + cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + } +} + +void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { + HistogramStat access_stats; + std::map bt_stats_map; + std::map> cf_bt_stats_map; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + access_stats.Add(block_access_info.second.num_accesses); + bt_stats_map[type].Add(block_access_info.second.num_accesses); + cf_bt_stats_map[cf_name][type].Add( + block_access_info.second.num_accesses); + } + } + } + } + fprintf(stdout, "Block access count stats: \n%s", + access_stats.ToString().c_str()); + for (auto const& bt_stats : bt_stats_map) { + fprintf(stdout, "Block access count stats for block type %s: \n%s", + block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + for (auto const& cf_bt_stats : cf_bt_stats_map) { + const std::string& cf_name = cf_bt_stats.first; + for (auto const& bt_stats : cf_bt_stats.second) { + fprintf(stdout, + "Block access count stats for column family %s and block type " + "%s: \n%s", + cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), + bt_stats.second.ToString().c_str()); + } + } +} + +void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { + HistogramStat existing_keys_stats; + std::map cf_existing_keys_stats_map; + HistogramStat non_existing_keys_stats; + std::map cf_non_existing_keys_stats_map; + HistogramStat block_access_stats; + std::map cf_block_access_info; + + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + if (block_access_info.second.num_keys == 0) { + continue; + } + // Use four decimal points. 
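+          // (The ratios below are fractions in [0, 1] scaled by 10000 and
+          // stored as integers, so a stored value of 1234 represents the
+          // fraction 0.1234, i.e., 12.34%.)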
+          uint64_t percent_referenced_for_existing_keys = (uint64_t)(
+              ((double)block_access_info.second.key_num_access_map.size() /
+               (double)block_access_info.second.num_keys) *
+              10000.0);
+          uint64_t percent_referenced_for_non_existing_keys =
+              (uint64_t)(((double)block_access_info.second
+                              .non_exist_key_num_access_map.size() /
+                          (double)block_access_info.second.num_keys) *
+                         10000.0);
+          uint64_t percent_accesses_for_existing_keys = (uint64_t)(
+              ((double)
+                   block_access_info.second.num_referenced_key_exist_in_block /
+               (double)block_access_info.second.num_accesses) *
+              10000.0);
+          existing_keys_stats.Add(percent_referenced_for_existing_keys);
+          cf_existing_keys_stats_map[cf_name].Add(
+              percent_referenced_for_existing_keys);
+          non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys);
+          cf_non_existing_keys_stats_map[cf_name].Add(
+              percent_referenced_for_non_existing_keys);
+          block_access_stats.Add(percent_accesses_for_existing_keys);
+          cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys);
+        }
+      }
+    }
+  }
+  fprintf(stdout,
+          "Histogram on the percentage of referenced keys that exist in a "
+          "block over the total number of keys in a block: \n%s",
+          existing_keys_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_existing_keys_stats_map) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  fprintf(
+      stdout,
+      "Histogram on the percentage of referenced keys that do NOT exist in a "
+      "block over the total number of keys in a block: \n%s",
+      non_existing_keys_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_non_existing_keys_stats_map) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  fprintf(stdout,
+          "Histogram on the percentage of accesses to keys that exist in a "
+          "block over the total number of accesses in a block: \n%s",
+          block_access_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_block_access_info) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+}
+
+void BlockCacheTraceAnalyzer::PrintStatsSummary() const {
+  uint64_t total_num_files = 0;
+  uint64_t total_num_blocks = 0;
+  uint64_t total_num_accesses = 0;
+  std::map bt_num_blocks_map;
+  std::map caller_num_access_map;
+  std::map>
+      caller_bt_num_access_map;
+  std::map>
+      caller_level_num_access_map;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    uint64_t cf_num_files = 0;
+    uint64_t cf_num_blocks = 0;
+    std::map cf_bt_blocks;
+    uint64_t cf_num_accesses = 0;
+    std::map cf_caller_num_accesses_map;
+    std::map>
+        cf_caller_level_num_accesses_map;
+    std::map>
+        cf_caller_file_num_accesses_map;
+    std::map>
+        cf_caller_bt_num_accesses_map;
+    total_num_files += cf_aggregates.second.fd_aggregates_map.size();
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      cf_num_files++;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        cf_bt_blocks[type] +=
+            block_type_aggregates.second.block_access_info_map.size();
+        total_num_blocks +=
+            block_type_aggregates.second.block_access_info_map.size();
+        bt_num_blocks_map[type] +=
+            block_type_aggregates.second.block_access_info_map.size();
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          cf_num_blocks++;
+          for (auto const& stats :
+               block_access_info.second.caller_num_access_map) {
+            // Stats per caller.
+            const BlockCacheLookupCaller caller = stats.first;
+            const uint64_t num_accesses = stats.second;
+            // Overall stats.
+            total_num_accesses += num_accesses;
+            caller_num_access_map[caller] += num_accesses;
+            caller_bt_num_access_map[caller][type] += num_accesses;
+            caller_level_num_access_map[caller][level] += num_accesses;
+            // Column Family stats.
+            cf_num_accesses++;
+            cf_caller_num_accesses_map[caller] += num_accesses;
+            cf_caller_level_num_accesses_map[caller][level] += num_accesses;
+            cf_caller_file_num_accesses_map[caller][fd] += num_accesses;
+            cf_caller_bt_num_accesses_map[caller][type] += num_accesses;
+          }
+        }
+      }
+    }
+
+    // Print stats.
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str());
+    fprintf(stdout,
+            "Number of files: %" PRIu64 " Number of blocks: %" PRIu64
+            " Number of accesses: %" PRIu64 "\n",
+            cf_num_files, cf_num_blocks, cf_num_accesses);
+    for (auto block_type : cf_bt_blocks) {
+      fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n",
+              block_type_to_string(block_type.first).c_str(),
+              block_type.second);
+    }
+    for (auto caller : cf_caller_num_accesses_map) {
+      fprintf(
+          stdout,
+          "***************************************************************\n");
+      fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n",
+              caller_to_string(caller.first).c_str(), caller.second);
+      fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_level :
+           cf_caller_level_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 "\n",
+                naccess_level.first, naccess_level.second);
+      }
+      fprintf(stdout, "Caller %s: Number of accesses per file break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_file : cf_caller_file_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t File %" PRIu64 ": Number of accesses: %" PRIu64 "\n",
+                naccess_file.first, naccess_file.second);
+      }
+      fprintf(stdout,
+              "Caller %s: Number of accesses per block type break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_type : cf_caller_bt_num_accesses_map[caller.first]) {
+        fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n",
+                block_type_to_string(naccess_type.first).c_str(),
+                naccess_type.second);
+      }
+    }
+  }
+  fprintf(stdout,
+          "***************************************************************\n");
+  fprintf(stdout,
+          "***************************************************************\n");
+  fprintf(stdout,
+          "***************************************************************\n");
+  fprintf(stdout, "Overall statistics:\n");
+  fprintf(stdout,
+          "Number of files: %" PRIu64 " Number of blocks: %"
PRIu64
+          " Number of accesses: %" PRIu64 "\n",
+          total_num_files, total_num_blocks, total_num_accesses);
+  for (auto block_type : bt_num_blocks_map) {
+    fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n",
+            block_type_to_string(block_type.first).c_str(), block_type.second);
+  }
+  for (auto caller : caller_num_access_map) {
+    fprintf(
+        stdout,
+        "***************************************************************\n");
+    fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n",
+            caller_to_string(caller.first).c_str(), caller.second);
+    fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto naccess_level : caller_level_num_access_map[caller.first]) {
+      fprintf(stdout, "\t Level %d: Number of accesses: %" PRIu64 "\n",
+              naccess_level.first, naccess_level.second);
+    }
+    fprintf(stdout, "Caller %s: Number of accesses per block type break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto naccess_type : caller_bt_num_access_map[caller.first]) {
+      fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n",
+              block_type_to_string(naccess_type.first).c_str(),
+              naccess_type.second);
+    }
+  }
+}
+
+}  // namespace rocksdb
diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h
new file mode 100644
index 00000000000..9dde8a939b5
--- /dev/null
+++ b/tools/block_cache_trace_analyzer.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+
+#include "rocksdb/env.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace rocksdb {
+
+// Statistics of a block.
+struct BlockAccessInfo {
+  uint64_t num_accesses = 0;
+  uint64_t block_size = 0;
+  uint64_t first_access_time = 0;
+  uint64_t last_access_time = 0;
+  uint64_t num_keys = 0;
+  std::map
+      key_num_access_map;  // for keys that exist in this block.
+  std::map
+      non_exist_key_num_access_map;  // for keys that do not exist in this block.
+  uint64_t num_referenced_key_exist_in_block = 0;
+  std::map caller_num_access_map;
+
+  void AddAccess(const BlockCacheTraceRecord& access) {
+    if (first_access_time == 0) {
+      first_access_time = access.access_timestamp;
+    }
+    last_access_time = access.access_timestamp;
+    block_size = access.block_size;
+    caller_num_access_map[access.caller]++;
+    num_accesses++;
+    if (ShouldTraceReferencedKey(access)) {
+      num_keys = access.num_keys_in_block;
+
+      if (access.is_referenced_key_exist_in_block == Boolean::kTrue) {
+        key_num_access_map[access.referenced_key]++;
+        num_referenced_key_exist_in_block++;
+      } else {
+        non_exist_key_num_access_map[access.referenced_key]++;
+      }
+    }
+  }
+};
+
+// Aggregates stats of a block given a block type.
+struct BlockTypeAccessInfoAggregate {
+  std::map block_access_info_map;
+};
+
+// Aggregates BlockTypeAggregate given an SST file.
+struct SSTFileAccessInfoAggregate {
+  uint32_t level;
+  std::map block_type_aggregates_map;
+};
+
+// Aggregates SSTFileAggregate given a column family.
+struct ColumnFamilyAccessInfoAggregate {
+  std::map fd_aggregates_map;
+};
+
+class BlockCacheTraceAnalyzer {
+ public:
+  BlockCacheTraceAnalyzer(const std::string& trace_file_path);
+  ~BlockCacheTraceAnalyzer() = default;
+  // No copy and move.
+  BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer& operator=(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer(BlockCacheTraceAnalyzer&&) = delete;
+  BlockCacheTraceAnalyzer& operator=(BlockCacheTraceAnalyzer&&) = delete;
+
+  // Reads all access records in the given trace file, maintains the stats of
+  // each block, and aggregates the information by block type, SST file, and
+  // column family. Subsequently, the caller may call Print* functions to
+  // print statistics.
+  Status Analyze();
+
+  // Print a summary of statistics of the trace, e.g.,
+  // Number of files: 2 Number of blocks: 50 Number of accesses: 50
+  // Number of Index blocks: 10
+  // Number of Filter blocks: 10
+  // Number of Data blocks: 10
+  // Number of UncompressionDict blocks: 10
+  // Number of RangeDeletion blocks: 10
+  // ***************************************************************
+  // Caller Get: Number of accesses 10
+  // Caller Get: Number of accesses per level break down
+  //    Level 0: Number of accesses: 10
+  // Caller Get: Number of accesses per block type break down
+  //    Block Type Index: Number of accesses: 2
+  //    Block Type Filter: Number of accesses: 2
+  //    Block Type Data: Number of accesses: 2
+  //    Block Type UncompressionDict: Number of accesses: 2
+  //    Block Type RangeDeletion: Number of accesses: 2
+  void PrintStatsSummary() const;
+
+  // Print the block size distribution, and its break down by block type and
+  // column family.
+  void PrintBlockSizeStats() const;
+
+  // Print the access count distribution, and its break down by block type and
+  // column family.
+  void PrintAccessCountStats() const;
+
+  // Print data block accesses by user Get and Multi-Get.
+  // It prints out 1) a histogram of the percentage of keys in a data block
+  // that are referenced, broken down by whether the referenced keys exist in
+  // the block and further by column family, and 2) a histogram of the
+  // percentage of accesses to keys that exist in a data block, also broken
+  // down by column family.
+  void PrintDataBlockAccessStats() const;
+
+  const std::map&
+  TEST_cf_aggregates_map() const {
+    return cf_aggregates_map_;
+  }
+
+ private:
+  void RecordAccess(const BlockCacheTraceRecord& access);
+
+  rocksdb::Env* env_;
+  std::string trace_file_path_;
+  BlockCacheTraceHeader header_;
+  std::map cf_aggregates_map_;
+};
+
+}  // namespace rocksdb
diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
new file mode 100644
index 00000000000..96f52c1ec00
--- /dev/null
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -0,0 +1,229 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <map>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/block_cache_trace_analyzer.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace rocksdb {
+
+namespace {
+const uint64_t kBlockSize = 1024;
+const std::string kBlockKeyPrefix = "test-block-";
+const uint32_t kCFId = 0;
+const uint32_t kLevel = 1;
+const uint64_t kSSTStoringEvenKeys = 100;
+const uint64_t kSSTStoringOddKeys = 101;
+const std::string kRefKeyPrefix = "test-get-";
+const uint64_t kNumKeysInBlock = 1024;
+}  // namespace
+
+class BlockCacheTracerTest : public testing::Test {
+ public:
+  BlockCacheTracerTest() {
+    test_path_ = test::PerThreadDBPath("block_cache_tracer_test");
+    env_ = rocksdb::Env::Default();
+    EXPECT_OK(env_->CreateDir(test_path_));
+    trace_file_path_ = test_path_ + "/block_cache_trace";
+  }
+
+  ~BlockCacheTracerTest() override {
+    if (getenv("KEEP_DB")) {
+      printf("The trace file is still at %s\n", trace_file_path_.c_str());
+      return;
+    }
+    EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+  BlockCacheLookupCaller GetCaller(uint32_t key_id) {
+    uint32_t n = key_id % 5;
+    switch (n) {
+      case 0:
+        return BlockCacheLookupCaller::kPrefetch;
+      case 1:
+        return BlockCacheLookupCaller::kCompaction;
+      case 2:
+        return BlockCacheLookupCaller::kUserGet;
+      case 3:
+        return BlockCacheLookupCaller::kUserMGet;
+      case 4:
+        return BlockCacheLookupCaller::kUserIterator;
+    }
+    // This cannot happen.
+    assert(false);
+    return BlockCacheLookupCaller::kUserGet;
+  }
+
+  void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
+                        TraceType block_type, uint32_t nblocks) {
+    assert(writer);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      BlockCacheTraceRecord record;
+      record.block_type = block_type;
+      record.block_size = kBlockSize + key_id;
+      record.block_key = kBlockKeyPrefix + std::to_string(key_id);
+      record.access_timestamp = env_->NowMicros();
+      record.cf_id = kCFId;
+      record.cf_name = kDefaultColumnFamilyName;
+      record.caller = GetCaller(key_id);
+      record.level = kLevel;
+      if (key_id % 2 == 0) {
+        record.sst_fd_number = kSSTStoringEvenKeys;
+      } else {
+        record.sst_fd_number = kSSTStoringOddKeys;
+      }
+      record.is_cache_hit = Boolean::kFalse;
+      record.no_insert = Boolean::kFalse;
+      // Provide these fields for all block types.
+      // The writer should only write these fields when the block is a data
+      // block and the caller is either GET or MGET.
+      record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
+      record.is_referenced_key_exist_in_block = Boolean::kTrue;
+      record.num_keys_in_block = kNumKeysInBlock;
+      ASSERT_OK(writer->WriteBlockAccess(record));
+    }
+  }
+
+  void AssertBlockAccessInfo(
+      uint32_t key_id, TraceType type,
+      const std::map<std::string, BlockAccessInfo>& block_access_info_map) {
+    auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+    ASSERT_TRUE(block_access_info_map.find(key_id_str) !=
+                block_access_info_map.end());
+    auto& block_access_info = block_access_info_map.find(key_id_str)->second;
+    ASSERT_EQ(1, block_access_info.num_accesses);
+    ASSERT_EQ(kBlockSize + key_id, block_access_info.block_size);
+    ASSERT_GT(block_access_info.first_access_time, 0);
+    ASSERT_GT(block_access_info.last_access_time, 0);
+    ASSERT_EQ(1, block_access_info.caller_num_access_map.size());
+    BlockCacheLookupCaller expected_caller = GetCaller(key_id);
+    ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) !=
+                block_access_info.caller_num_access_map.end());
+    ASSERT_EQ(
+        1,
+        block_access_info.caller_num_access_map.find(expected_caller)->second);
+
+    if ((expected_caller == BlockCacheLookupCaller::kUserGet ||
+         expected_caller == BlockCacheLookupCaller::kUserMGet) &&
+        type == TraceType::kBlockTraceDataBlock) {
+      ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys);
+      ASSERT_EQ(1, block_access_info.key_num_access_map.size());
+      ASSERT_EQ(0, block_access_info.non_exist_key_num_access_map.size());
+      ASSERT_EQ(1, block_access_info.num_referenced_key_exist_in_block);
+    }
+  }
+
+  Env* env_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {
+  {
+    // Generate a trace file containing a mix of blocks.
+    // It contains two SST files with 25 blocks of odd-numbered block_key in
+    // kSSTStoringOddKeys and 25 blocks of even-numbered block_key in
+    // kSSTStoringEvenKeys.
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer));
+    ASSERT_OK(writer.WriteHeader());
+    // Write blocks of different types.
+    WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock,
+                     10);
+    WriteBlockAccess(&writer, 10, TraceType::kBlockTraceDataBlock, 10);
+    WriteBlockAccess(&writer, 20, TraceType::kBlockTraceFilterBlock, 10);
+    WriteBlockAccess(&writer, 30, TraceType::kBlockTraceIndexBlock, 10);
+    WriteBlockAccess(&writer, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, header.rocksdb_major_version);
+    ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version);
+    // Read blocks.
+    BlockCacheTraceAnalyzer analyzer(trace_file_path_);
+    // The analyzer ends when it detects an incomplete access record.
+    ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze());
+    const uint64_t expected_num_cfs = 1;
+    std::vector<uint64_t> expected_fds{kSSTStoringOddKeys, kSSTStoringEvenKeys};
+    const std::vector<TraceType> expected_types{
+        TraceType::kBlockTraceUncompressionDictBlock,
+        TraceType::kBlockTraceDataBlock, TraceType::kBlockTraceFilterBlock,
+        TraceType::kBlockTraceIndexBlock,
+        TraceType::kBlockTraceRangeDeletionBlock};
+    const uint64_t expected_num_keys_per_type = 5;
+
+    auto& stats = analyzer.TEST_cf_aggregates_map();
+    ASSERT_EQ(expected_num_cfs, stats.size());
+    ASSERT_TRUE(stats.find(kDefaultColumnFamilyName) != stats.end());
+    auto& cf_stats = stats.find(kDefaultColumnFamilyName)->second;
+    ASSERT_EQ(expected_fds.size(), cf_stats.fd_aggregates_map.size());
+    for (auto fd_id : expected_fds) {
+      ASSERT_TRUE(cf_stats.fd_aggregates_map.find(fd_id) !=
+                  cf_stats.fd_aggregates_map.end());
+      ASSERT_EQ(kLevel, cf_stats.fd_aggregates_map.find(fd_id)->second.level);
+      auto& block_type_aggregates_map = cf_stats.fd_aggregates_map.find(fd_id)
+                                            ->second.block_type_aggregates_map;
+      ASSERT_EQ(expected_types.size(), block_type_aggregates_map.size());
+      uint32_t key_id = 0;
+      for (auto type : expected_types) {
+        ASSERT_TRUE(block_type_aggregates_map.find(type) !=
+                    block_type_aggregates_map.end());
+        auto& block_access_info_map =
+            block_type_aggregates_map.find(type)->second.block_access_info_map;
+        // Each block type has 5 blocks.
+        ASSERT_EQ(expected_num_keys_per_type, block_access_info_map.size());
+        for (uint32_t i = 0; i < 10; i++) {
+          // Verify that odd numbered blocks are stored in kSSTStoringOddKeys
+          // and even numbered blocks are stored in kSSTStoringEvenKeys.
+          auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+          if (fd_id == kSSTStoringOddKeys) {
+            if (key_id % 2 == 1) {
+              AssertBlockAccessInfo(key_id, type, block_access_info_map);
+            } else {
+              ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+                          block_access_info_map.end());
+            }
+          } else {
+            if (key_id % 2 == 1) {
+              ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+                          block_access_info_map.end());
+            } else {
+              AssertBlockAccessInfo(key_id, type, block_access_info_map);
+            }
+          }
+          key_id++;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index 8d0119a6891..58c7df70b20 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -15,12 +15,13 @@ namespace rocksdb {
 
 namespace {
 const unsigned int kCharSize = 1;
+}  // namespace
+
 bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) {
   return (record.block_type == TraceType::kBlockTraceDataBlock) &&
          (record.caller == BlockCacheLookupCaller::kUserGet ||
           record.caller == BlockCacheLookupCaller::kUserMGet);
 }
-}  // namespace
 
 BlockCacheTraceWriter::BlockCacheTraceWriter(
     Env* env, const TraceOptions& trace_options,
diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h
index 5fd14cbf11b..e24d5a5ef35 100644
--- a/trace_replay/block_cache_tracer.h
+++ b/trace_replay/block_cache_tracer.h
@@ -77,6 +77,8 @@ struct BlockCacheTraceHeader {
   uint32_t rocksdb_minor_version;
 };
 
+bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record);
+
 // BlockCacheTraceWriter captures all RocksDB block cache accesses using a
 // user-provided TraceWriter. Every RocksDB operation is written as a single
 // trace.
 // Each trace will have a timestamp and type, followed by the trace payload.

From 7177dc46a13332c96332d524b20f14b7e1372d07 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Tue, 11 Jun 2019 13:04:59 -0700
Subject: [PATCH 130/572] Handle missing WAL in secondary mode (#5323)

Summary:
In secondary mode, it is possible that the secondary lists the primary's WAL
directory, finds a WAL and tries to open it. The primary may delete the WAL
after the secondary lists the directory but before the secondary opens the
file. The secondary will then fail to open the WAL file with a PathNotFound
status. In this case, we can return OK without replaying the WAL and
optionally replay more of the MANIFEST.

Test Plan (on my dev machine):
Without this PR, the following will fail several times out of 100 runs.
```
~/gtest-parallel/gtest-parallel -r 100 -w 16 ./db_secondary_test --gtest_filter=DBSecondaryTest.SwitchToNewManifestDuringOpen
```
With this PR, the above should always succeed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5323

Differential Revision: D15763878

Pulled By: riversand963

fbshipit-source-id: c7164fa7cb8d9001abc258b6a2dc93613e4f38ff
---
 db/db_impl/db_impl_secondary.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 2737df0ae8c..5cd0beb1f0c 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -60,6 +60,12 @@ Status DBImplSecondary::Recover(
     s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
   }
 
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
   // TODO: update options_file_number_ needed?
 
   job_context.Clean();
@@ -475,6 +481,12 @@ Status DBImplSecondary::TryCatchUpWithPrimary() {
   if (s.ok()) {
     s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
   }
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
   if (s.ok()) {
     for (auto cfd : cfds_changed) {
       cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),

From ba64a4cf52cce5cf180135e5aeddaa90b7887f9d Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Tue, 11 Jun 2019 16:19:13 -0700
Subject: [PATCH 131/572] Revert "Reduce iterator key comparison for
 upper/lower bound check (#5111)" (#5440)

Summary:
This reverts commit f3a7847598d89ef8f9f531b10fabb7ce044a38f8.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5440

Differential Revision: D15765967

Pulled By: ltamasi

fbshipit-source-id: d027fe24132e3729289cd7c01857a7eb449d9dd0
---
 HISTORY.md                                    |  1 -
 db/db_iter.cc                                 |  9 +----
 db/version_set.cc                             | 40 +++++--------------
 table/block_based/block_based_table_reader.cc | 26 +++++-------
 table/block_based/block_based_table_reader.h  |  9 +----
 table/internal_iterator.h                     | 25 +-----------
 table/iterator_wrapper.h                      | 22 ++--------
 table/merging_iterator.cc                     | 24 -----------
 8 files changed, 28 insertions(+), 128 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index ad6c370b5a0..5574c769878 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,7 +18,6 @@
 * Reduce binary search when iterator reseek into the same data block.
 * DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
 * Merging iterator to avoid child iterator reseek for some cases
-* Reduce iterator key comparision for upper/lower bound check.
* Log Writer will flush after finishing the whole record, rather than a fragment. ### General Improvements diff --git a/db/db_iter.cc b/db/db_iter.cc index 633724c5763..b89d7301131 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -467,9 +467,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) is_key_seqnum_zero_ = (ikey_.sequence == 0); - assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || - user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); - if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && + if (iterate_upper_bound_ != nullptr && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -861,10 +859,7 @@ void DBIter::PrevInternal() { return; } - assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) >= 0); - if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && + if (iterate_lower_bound_ != nullptr && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. diff --git a/db/version_set.cc b/db/version_set.cc index 8895879bfbf..658a397fa58 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -885,7 +885,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(IterateResult* result) override; + bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -893,38 +893,23 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } - Slice value() const override { assert(Valid()); return file_iter_.value(); } - Status status() const override { return file_iter_.iter() ? 
file_iter_.status() : Status::OK(); } - - inline bool MayBeOutOfLowerBound() override { - assert(Valid()); - return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); - } - - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return file_iter_.MayBeOutOfUpperBound(); - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } - bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } - bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -968,16 +953,12 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } - may_be_out_of_lower_bound_ = - read_options_.iterate_lower_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(file_smallest_key(file_index_)), - *read_options_.iterate_lower_bound) < 0; return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, file_read_hist_, - for_compaction_, nullptr /* arena */, skip_filters_, level_, - smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, + file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, + level_, smallest_compaction_key, largest_compaction_key); } TableCache* table_cache_; @@ -993,7 +974,6 @@ class LevelIterator final : public InternalIterator { bool should_sample_; bool for_compaction_; bool skip_filters_; - bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1062,12 +1042,11 @@ void LevelIterator::SeekToLast() { void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(IterateResult* result) { +bool LevelIterator::NextAndGetResult(Slice* ret_key) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + *ret_key = key(); } return is_valid; } @@ -4363,9 +4342,10 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), - last_sequence_.load(), log_number, prev_log_number_, - column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, + next_file_number_.load(), last_sequence_.load(), log_number, + prev_log_number_, column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index d1beafed68b..75c8301c5c2 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2535,12 +2535,11 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - IterateResult* result) { + Slice* ret_key) { Next(); bool is_valid = Valid(); if (is_valid) { 
- result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + *ret_key = key(); } return is_valid; } @@ -2621,11 +2620,6 @@ void BlockBasedTableIterator::InitDataBlock() { key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; - if (read_options_.iterate_upper_bound != nullptr) { - data_block_within_upper_bound_ = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) > 0); - } } } @@ -2638,15 +2632,13 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - // TODO: we should be able to use !data_block_within_upper_bound_ here - // instead of performing the comparison; however, the flag can apparently - // be out of sync with the comparison in some cases. This should be - // investigated. - const bool next_block_is_out_of_bound = - read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); + bool next_block_is_out_of_bound = false; + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + next_block_is_out_of_bound = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + } ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index a92289f9bee..420da25932b 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -608,7 +608,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(IterateResult* result) override; + bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && block_iter_points_to_real_block_ && @@ -639,11 +639,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return !data_block_within_upper_bound_; - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -705,8 +700,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { TBlockIter block_iter_; bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; - // Whether current data block being fully within iterate upper bound. - bool data_block_within_upper_bound_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 1f57399c7f7..8f1cc9dd68e 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,11 +17,6 @@ namespace rocksdb { class PinnedIteratorsManager; -struct IterateResult { - Slice key; - bool may_be_out_of_upper_bound; -}; - template class InternalIteratorBase : public Cleanable { public: @@ -60,20 +55,11 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - // Moves to the next entry in the source, and return result. 
Iterator - // implementation should override this method to help methods inline better, - // or when MayBeOutOfUpperBound() is non-trivial. - // REQUIRES: Valid() - virtual bool NextAndGetResult(IterateResult* result) { + virtual bool NextAndGetResult(Slice* ret_key) { Next(); bool is_valid = Valid(); if (is_valid) { - result->key = key(); - // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual - // call. If an implementation has non-trivial MayBeOutOfUpperBound(), - // it should also override NextAndGetResult(). - result->may_be_out_of_upper_bound = true; - assert(MayBeOutOfUpperBound()); + *ret_key = key(); } return is_valid; } @@ -108,13 +94,6 @@ class InternalIteratorBase : public Cleanable { // upper bound virtual bool IsOutOfBound() { return false; } - // Keys return from this iterator can be smaller than iterate_lower_bound. - virtual bool MayBeOutOfLowerBound() { return true; } - - // Keys return from this iterator can be larger or equal to - // iterate_upper_bound. - virtual bool MayBeOutOfUpperBound() { return true; } - // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a5aa5c49eac..a570e53c1e2 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,10 +56,7 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { - assert(Valid()); - return result_.key; - } + Slice key() const { assert(Valid()); return key_; } TValue value() const { assert(Valid()); return iter_->value(); @@ -68,7 +65,7 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&result_); + valid_ = iter_->NextAndGetResult(&key_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -86,16 +83,6 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } - bool MayBeOutOfLowerBound() { - assert(Valid()); - return iter_->MayBeOutOfLowerBound(); - } - - bool MayBeOutOfUpperBound() { - assert(Valid()); - return result_.may_be_out_of_upper_bound; - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -113,15 +100,14 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { + key_ = iter_->key(); assert(iter_->status().ok()); - result_.key = iter_->key(); - result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; - IterateResult result_; bool valid_; + Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 1a0d4df8995..207066b5a1e 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,16 +227,6 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } - bool NextAndGetResult(IterateResult* result) override { - Next(); - bool is_valid = Valid(); - if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); - } - return is_valid; - } - void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). 
@@ -306,20 +296,6 @@ class MergingIterator : public InternalIterator { return current_->value(); } - // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result - // from current child iterator. Potentially as long as one of child iterator - // report out of bound is not possible, we know current key is within bound. - - bool MayBeOutOfLowerBound() override { - assert(Valid()); - return current_->MayBeOutOfLowerBound(); - } - - bool MayBeOutOfUpperBound() override { - assert(Valid()); - return current_->MayBeOutOfUpperBound(); - } - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { From ca1aee2a198f8b461f4c168232ed65d9a205ce9e Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Tue, 11 Jun 2019 17:58:31 -0700 Subject: [PATCH 132/572] WriteUnprepared: commit only from the 2nd queue (#5439) Summary: This is a port of this PR into WriteUnprepared: https://github.com/facebook/rocksdb/pull/5014 This also reverts this test change to restore some flaky write unprepared tests: https://github.com/facebook/rocksdb/pull/5315 Tested with: $ gtest-parallel ./transaction_test --gtest_filter=MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/9 --repeat=128 [128/128] MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/9 (18250 ms) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5439 Differential Revision: D15761405 Pulled By: lth fbshipit-source-id: ae2581fd942d8a5b3f9278fd6bc3c1ac0b2c964c --- utilities/transactions/transaction_test.cc | 4 ++ .../transactions/write_unprepared_txn.cc | 54 ++++++++++--------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 35a9706830e..a410c5b5196 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -74,6 +74,10 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false), std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false), + std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false), std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); #endif // ROCKSDB_VALGRIND_RUN diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index a1fe213ddd3..54d478c9466 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -281,23 +281,30 @@ Status WriteUnpreparedTxn::CommitInternal() { const bool disable_memtable = !includes_data; const bool do_one_write = !db_impl_->immutable_db_options().two_write_queues || disable_memtable; - const bool publish_seq = do_one_write; - // Note: CommitTimeWriteBatch does not need AddPrepared since it is written to - // DB in one shot. min_uncommitted still works since it requires capturing - // data that is written to DB but not yet committed, while - // CommitTimeWriteBatch commits with PreReleaseCallback. 
+ WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map( - wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt, publish_seq); + wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt); + const bool kFirstPrepareBatch = true; + AddPreparedCallback add_prepared_callback( + wpt_db_, db_impl_, commit_batch_cnt, + db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch); + PreReleaseCallback* pre_release_callback; + if (do_one_write) { + pre_release_callback = &update_commit_map; + } else { + pre_release_callback = &add_prepared_callback; + } uint64_t seq_used = kMaxSequenceNumber; - // Since the prepared batch is directly written to memtable, there is already - // a connection between the memtable and its WAL, so there is no need to - // redundantly reference the log that contains the prepared data. + // Since the prepared batch is directly written to memtable, there is + // already a connection between the memtable and its WAL, so there is no + // need to redundantly reference the log that contains the prepared data. const uint64_t zero_log_number = 0ull; size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1; auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, zero_log_number, disable_memtable, &seq_used, - batch_cnt, &update_commit_map); + batch_cnt, pre_release_callback); assert(!s.ok() || seq_used != kMaxSequenceNumber); + const SequenceNumber commit_batch_seq = seq_used; if (LIKELY(do_one_write || !s.ok())) { if (LIKELY(s.ok())) { // Note RemovePrepared should be called after WriteImpl that publishsed @@ -306,30 +313,25 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } } + if (UNLIKELY(!do_one_write)) { + wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); + } unprep_seqs_.clear(); write_set_keys_.clear(); return s; } // else do the 2nd write to publish seq + + // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the + // commit write batch as just another "unprepared" batch. This will also + // update the unprep_seqs_ in the update_commit_map callback. + unprep_seqs_[commit_batch_seq] = commit_batch_cnt; + // Note: the 2nd write comes with a performance penality. So if we have too // many of commits accompanied with ComitTimeWriteBatch and yet we cannot // enable use_only_the_last_commit_time_batch_for_recovery_ optimization, // two_write_queues should be disabled to avoid many additional writes here. 
-  class PublishSeqPreReleaseCallback : public PreReleaseCallback {
-   public:
-    explicit PublishSeqPreReleaseCallback(DBImpl* db_impl)
-        : db_impl_(db_impl) {}
-    Status Callback(SequenceNumber seq,
-                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
-                    size_t /*index*/, size_t /*total*/) override {
-      assert(is_mem_disabled);
-      assert(db_impl_->immutable_db_options().two_write_queues);
-      db_impl_->SetLastPublishedSequence(seq);
-      return Status::OK();
-    }
-   private:
-    DBImpl* db_impl_;
-  } publish_seq_callback(db_impl_);
+  // Update commit map only from the 2nd queue
   WriteBatch empty_batch;
   empty_batch.PutLogData(Slice());
   // In the absence of Prepare markers, use Noop as a batch separator
@@ -339,7 +341,7 @@ Status WriteUnpreparedTxn::CommitInternal() {
   const uint64_t NO_REF_LOG = 0;
   s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
                           NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
-                          &publish_seq_callback);
+                          &update_commit_map);
   assert(!s.ok() || seq_used != kMaxSequenceNumber);
   // Note RemovePrepared should be called after WriteImpl that published the
   // seq. Otherwise SmallestUnCommittedSeq optimization breaks.

From ba64a4cf52cce5cf180135e5aeddaa90b7887f9d Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Tue, 11 Jun 2019 19:52:08 -0700
Subject: [PATCH 133/572] WritePrepared: switch PreparedHeap from
 priority_queue to deque (#5436)

Summary:
Internally PreparedHeap is currently using a priority_queue. The rationale was
that, in the initial design, PreparedHeap::AddPrepared could be called in
arbitrary order. With the recent optimizations, we call ::AddPrepared only from
the main write queue, which results in in-order insertion into PreparedHeap.
The patch thus replaces the underlying priority_queue with a more efficient
deque implementation.
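In outline, the new scheme can be captured by the following minimal,
single-threaded sketch. The names and the UINT64_MAX empty-sentinel are
illustrative only, not RocksDB's actual PreparedHeap, which additionally
guards push/pop with push_pop_mutex_ and relies on external synchronization
for erase:
```
#include <cstdint>
#include <deque>
#include <functional>
#include <queue>
#include <vector>

class SimplePreparedHeap {
 public:
  // REQUIRES: v is larger than any value pushed before (in-order insertion),
  // which is what lets a deque replace the priority_queue.
  void Push(uint64_t v) { heap_.push_back(v); }  // O(1) instead of O(log n)

  // Erase is lazy: values buried in the deque are remembered in a small
  // min-heap and skipped once they reach the front.
  void Erase(uint64_t v) {
    if (heap_.empty() || v < heap_.front()) {
      return;  // already popped (or never pushed); ignore
    }
    if (v == heap_.front()) {
      Pop();
    } else {
      erased_.push(v);  // deferred removal
    }
  }

  // Smallest live value, or UINT64_MAX when empty.
  uint64_t Top() const { return heap_.empty() ? UINT64_MAX : heap_.front(); }

 private:
  void Pop() {
    heap_.pop_front();
    // Drain any entries that were erased while buried in the deque.
    while (!heap_.empty() && !erased_.empty() &&
           heap_.front() >= erased_.top()) {
      if (heap_.front() == erased_.top()) {
        heap_.pop_front();
      }
      erased_.pop();
    }
    // Drop erase records for values that were never actually pushed.
    while (heap_.empty() && !erased_.empty()) {
      erased_.pop();
    }
  }

  std::deque<uint64_t> heap_;  // stays sorted thanks to in-order pushes
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      erased_;
};
```
Because pushes arrive pre-sorted, push_back keeps the deque ordered for free,
and the lazy-erase min-heap only has to track the rare entries removed out of
order.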
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5436 Differential Revision: D15752147 Pulled By: maysamyabandeh fbshipit-source-id: e6960f2b2097e13137dded1ceeff3b10b03b0aeb --- .../write_prepared_transaction_test.cc | 112 +++++++++++------- .../transactions/write_prepared_txn_db.cc | 7 ++ .../transactions/write_prepared_txn_db.h | 31 +++-- .../transactions/write_unprepared_txn_db.cc | 13 +- 4 files changed, 102 insertions(+), 61 deletions(-) diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 88f4ea032a9..7830cbd75fc 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -48,18 +48,21 @@ using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat; TEST(PreparedHeap, BasicsTest) { WritePreparedTxnDB::PreparedHeap heap; - heap.push(14l); - // Test with one element - ASSERT_EQ(14l, heap.top()); - heap.push(24l); - heap.push(34l); - // Test that old min is still on top - ASSERT_EQ(14l, heap.top()); - heap.push(44l); - heap.push(54l); - heap.push(64l); - heap.push(74l); - heap.push(84l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(14l); + // Test with one element + ASSERT_EQ(14l, heap.top()); + heap.push(24l); + heap.push(34l); + // Test that old min is still on top + ASSERT_EQ(14l, heap.top()); + heap.push(44l); + heap.push(54l); + heap.push(64l); + heap.push(74l); + heap.push(84l); + } // Test that old min is still on top ASSERT_EQ(14l, heap.top()); heap.erase(24l); @@ -81,11 +84,14 @@ TEST(PreparedHeap, BasicsTest) { ASSERT_EQ(64l, heap.top()); heap.erase(84l); ASSERT_EQ(64l, heap.top()); - heap.push(85l); - heap.push(86l); - heap.push(87l); - heap.push(88l); - heap.push(89l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(85l); + heap.push(86l); + heap.push(87l); + heap.push(88l); + heap.push(89l); + } heap.erase(87l); heap.erase(85l); heap.erase(89l); @@ -106,13 +112,19 @@ TEST(PreparedHeap, BasicsTest) { // not resurface again. TEST(PreparedHeap, EmptyAtTheEnd) { WritePreparedTxnDB::PreparedHeap heap; - heap.push(40l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } ASSERT_EQ(40l, heap.top()); // Although not a recommended scenario, we must be resilient against erase // without a prior push. heap.erase(50l); ASSERT_EQ(40l, heap.top()); - heap.push(60l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } ASSERT_EQ(40l, heap.top()); heap.erase(60l); @@ -120,11 +132,17 @@ TEST(PreparedHeap, EmptyAtTheEnd) { heap.erase(40l); ASSERT_TRUE(heap.empty()); - heap.push(40l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(40l); + } ASSERT_EQ(40l, heap.top()); heap.erase(50l); ASSERT_EQ(40l, heap.top()); - heap.push(60l); + { + MutexLock ml(heap.push_pop_mutex()); + heap.push(60l); + } ASSERT_EQ(40l, heap.top()); heap.erase(40l); @@ -139,30 +157,37 @@ TEST(PreparedHeap, EmptyAtTheEnd) { // successfully emptied at the end. TEST(PreparedHeap, Concurrent) { const size_t t_cnt = 10; - rocksdb::port::Thread t[t_cnt]; - Random rnd(1103); + rocksdb::port::Thread t[t_cnt + 1]; WritePreparedTxnDB::PreparedHeap heap; port::RWMutex prepared_mutex; + std::atomic last; for (size_t n = 0; n < 100; n++) { - for (size_t i = 0; i < t_cnt; i++) { - // This is not recommended usage but we should be resilient against it. 
- bool skip_push = rnd.OneIn(5); - t[i] = rocksdb::port::Thread([&heap, &prepared_mutex, skip_push, i]() { - auto seq = i; - std::this_thread::yield(); + last = 0; + t[0] = rocksdb::port::Thread([&heap, t_cnt, &last]() { + Random rnd(1103); + for (size_t seq = 1; seq <= t_cnt; seq++) { + // This is not recommended usage but we should be resilient against it. + bool skip_push = rnd.OneIn(5); if (!skip_push) { - WriteLock wl(&prepared_mutex); + MutexLock ml(heap.push_pop_mutex()); + std::this_thread::yield(); heap.push(seq); + last.store(seq); } - std::this_thread::yield(); - { - WriteLock wl(&prepared_mutex); - heap.erase(seq); - } + } + }); + for (size_t i = 1; i <= t_cnt; i++) { + t[i] = rocksdb::port::Thread([&heap, &prepared_mutex, &last, i]() { + auto seq = i; + do { + std::this_thread::yield(); + } while (last.load() < seq); + WriteLock wl(&prepared_mutex); + heap.erase(seq); }); } - for (size_t i = 0; i < t_cnt; i++) { + for (size_t i = 0; i <= t_cnt; i++) { t[i].join(); } ASSERT_TRUE(heap.empty()); @@ -3197,7 +3222,7 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { ReOpen(); std::atomic snap = {nullptr}; std::atomic exp_prepare = {0}; - std::atomic snapshot_taken = {false}; + rocksdb::port::Thread callback_thread; // Value is synchronized via snap PinnableSlice value; // Take a snapshot after publish and before RemovePrepared:Start @@ -3208,7 +3233,6 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { roptions.snapshot = snap.load(); auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value); ASSERT_OK(s); - snapshot_taken.store(true); }; auto callback = [&](void* param) { SequenceNumber prep_seq = *((SequenceNumber*)param); @@ -3216,8 +3240,7 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { // We need to spawn a thread to avoid deadlock since getting a // snpashot might end up calling AdvanceSeqByOne which needs joining // the write queue. - auto t = rocksdb::port::Thread(snap_callback); - t.detach(); + callback_thread = rocksdb::port::Thread(snap_callback); TEST_SYNC_POINT("callback:end"); } }; @@ -3250,15 +3273,12 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { // Let an eviction to kick in std::this_thread::yield(); - snapshot_taken.store(false); exp_prepare.store(txn->GetId()); ASSERT_OK(txn->Commit()); delete txn; // Wait for the snapshot taking that is triggered by // RemovePrepared:Start callback - while (!snapshot_taken) { - std::this_thread::yield(); - } + callback_thread.join(); // Read with the snapshot taken before delayed_prepared_ cleanup ReadOptions roptions; @@ -3278,9 +3298,9 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) { }); write_thread.join(); eviction_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } } diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 96e1aa7a7ba..a3b523a22cf 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -32,12 +32,19 @@ Status WritePreparedTxnDB::Initialize( auto dbimpl = reinterpret_cast(GetRootDB()); assert(dbimpl != nullptr); auto rtxns = dbimpl->recovered_transactions(); + std::map ordered_seq_cnt; for (auto rtxn : rtxns) { // There should only one batch for WritePrepared policy. 
      assert(rtxn.second->batches_.size() == 1);
      const auto& seq = rtxn.second->batches_.begin()->first;
      const auto& batch_info = rtxn.second->batches_.begin()->second;
      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+     ordered_seq_cnt[seq] = cnt;
+   }
+   // AddPrepared must be called in order
+   for (auto seq_cnt : ordered_seq_cnt) {
+     auto seq = seq_cnt.first;
+     auto cnt = seq_cnt.second;
      for (size_t i = 0; i < cnt; i++) {
        AddPrepared(seq + i);
      }
diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h
index acf2b97a99d..9561bfada17 100644
--- a/utilities/transactions/write_prepared_txn_db.h
+++ b/utilities/transactions/write_prepared_txn_db.h
@@ -511,9 +511,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
     // The mutex is required for push and pop from PreparedHeap. ::erase will
     // use external synchronization via prepared_mutex_.
     port::Mutex push_pop_mutex_;
-    // TODO(myabandeh): replace it with deque
-    std::priority_queue<uint64_t, std::vector<uint64_t>,
-                        std::greater<uint64_t>>
-        heap_;
+    std::deque<uint64_t> heap_;
     std::priority_queue<uint64_t, std::vector<uint64_t>,
                         std::greater<uint64_t>> erased_heap_;
     std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
@@ -534,21 +532,27 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
     // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
     inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
     inline void push(uint64_t v) {
-      heap_.push(v);
-      heap_top_.store(heap_.top(), std::memory_order_release);
+      push_pop_mutex_.AssertHeld();
+      if (heap_.empty()) {
+        heap_top_.store(v, std::memory_order_release);
+      } else {
+        assert(heap_top_.load() < v);
+      }
+      heap_.push_back(v);
     }
     void pop(bool locked = false) {
      if (!locked) {
        push_pop_mutex()->Lock();
      }
-      heap_.pop();
+      push_pop_mutex_.AssertHeld();
+      heap_.pop_front();
      while (!heap_.empty() && !erased_heap_.empty() &&
             // heap_.top() > erased_heap_.top() could happen if we have erased
             // a non-existent entry. Ideally the user should not do that but we
             // should be resilient against it.
-             heap_.top() >= erased_heap_.top()) {
-        if (heap_.top() == erased_heap_.top()) {
-          heap_.pop();
+             heap_.front() >= erased_heap_.top()) {
+        if (heap_.front() == erased_heap_.top()) {
+          heap_.pop_front();
        }
        uint64_t erased __attribute__((__unused__));
        erased = erased_heap_.top();
@@ -559,7 +563,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
      while (heap_.empty() && !erased_heap_.empty()) {
        erased_heap_.pop();
      }
-      heap_top_.store(!heap_.empty() ? heap_.top() : kMaxSequenceNumber,
+      heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
                      std::memory_order_release);
      if (!locked) {
        push_pop_mutex()->Unlock();
@@ -568,13 +572,16 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
    // Concurrent calls need external synchronization. It is safe to be called
    // concurrent to push and pop though.
    void erase(uint64_t seq) {
-      if (!heap_.empty()) {
+      if (!empty()) {
        auto top_seq = top();
        if (seq < top_seq) {
          // Already popped, ignore it.
} else if (top_seq == seq) { pop(); - assert(heap_.empty() || heap_.top() != seq); +#ifndef NDEBUG + MutexLock ml(push_pop_mutex()); + assert(heap_.empty() || heap_.front() != seq); +#endif } else { // top() > seq // Down the heap, remember to pop it later erased_heap_.push(seq); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 0c94183947f..9382edfad2b 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -225,6 +225,7 @@ Status WriteUnpreparedTxnDB::Initialize( // create 'real' transactions from recovered shell transactions auto rtxns = dbimpl->recovered_transactions(); + std::map ordered_seq_cnt; for (auto rtxn : rtxns) { auto recovered_trx = rtxn.second; assert(recovered_trx); @@ -266,9 +267,7 @@ Status WriteUnpreparedTxnDB::Initialize( auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1; assert(batch_info.log_number_); - for (size_t i = 0; i < cnt; i++) { - AddPrepared(seq + i); - } + ordered_seq_cnt[seq] = cnt; assert(wupt->unprep_seqs_.count(seq) == 0); wupt->unprep_seqs_[seq] = cnt; KeySetBuilder keyset_handler(wupt, @@ -288,6 +287,14 @@ Status WriteUnpreparedTxnDB::Initialize( break; } } + // AddPrepared must be called in order + for (auto seq_cnt: ordered_seq_cnt) { + auto seq = seq_cnt.first; + auto cnt = seq_cnt.second; + for (size_t i = 0; i < cnt; i++) { + AddPrepared(seq + i); + } + } SequenceNumber prev_max = max_evicted_seq_; SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber(); From 4a285d0dd318985b99a88318c96514fd738aa1e6 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 09:42:49 -0700 Subject: [PATCH 134/572] Remove passing const variable to thread (#5443) Summary: CLANG complains that passing const to thread is not necessary. The patch removes it form PreparedHeap::Concurrent test. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5443 Differential Revision: D15781598 Pulled By: maysamyabandeh fbshipit-source-id: 3aceb05d96182fa4726d6d37eed45fd3aac4c016 --- utilities/transactions/write_prepared_transaction_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 7830cbd75fc..66ea8fa530f 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -164,7 +164,7 @@ TEST(PreparedHeap, Concurrent) { for (size_t n = 0; n < 100; n++) { last = 0; - t[0] = rocksdb::port::Thread([&heap, t_cnt, &last]() { + t[0] = rocksdb::port::Thread([&heap, &last]() { Random rnd(1103); for (size_t seq = 1; seq <= t_cnt; seq++) { // This is not recommended usage but we should be resilient against it. From f43edff9ac78f8f08edc15092f9e08d4bea10282 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 10:29:56 -0700 Subject: [PATCH 135/572] Disable kPipelinedWrite in MultiThreaded (#5442) Summary: TSAN tests report a race condition. We temporarily exclude kPipelinedWrite from MultiThreaded until the race condition is fixed. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5442 Differential Revision: D15782349 Pulled By: maysamyabandeh fbshipit-source-id: 42b4f9b3fa9137f0675e13ad132c0a06800c1bdd --- db/db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_test.cc b/db/db_test.cc index a27a5eeb97f..3bac53f2f0a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2285,6 +2285,7 @@ class MultiThreadedDBTest }; TEST_P(MultiThreadedDBTest, MultiThreaded) { + if (option_config_ == kPipelinedWrite) return; anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); From f9842869cf2dc2278322a4f00ccb45a978c7a923 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 11:09:02 -0700 Subject: [PATCH 136/572] Disable pipeline writes in stress test (#5445) Summary: The tsan crash tests are failing with a data race compliant with pipelined write option. Temporarily disable it until its concurrency issue are fixed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5445 Differential Revision: D15783824 Pulled By: maysamyabandeh fbshipit-source-id: 413a0c3230b86f524fc7eeea2cf8e8375406e65b --- tools/db_crashtest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6487562d8bb..173a6a8da9c 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -37,7 +37,8 @@ "delpercent": 4, "delrangepercent": 1, "destroy_db_initially": 0, - "enable_pipelined_write": lambda: random.randint(0, 1), + # Temporarily disable it until its concurrency issue are fixed + "enable_pipelined_write": 0, "expected_values_path": expected_values_file.name, "flush_one_in": 1000000, "max_background_compactions": 20, From 60f3ec2ca57796203c880d494c872f0086768ce2 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 12 Jun 2019 15:00:53 -0700 Subject: [PATCH 137/572] Fix appveyor compliant about passing const to thread (#5447) Summary: CLANG would complain if we pass const to lambda function and appveyor complains if we don't (https://github.com/facebook/rocksdb/pull/5443). The patch fixes that by using the default capture mode. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5447 Differential Revision: D15788722 Pulled By: maysamyabandeh fbshipit-source-id: 47e7f49264afe31fdafe42cb8bf93da126abfca9 --- utilities/transactions/write_prepared_transaction_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 66ea8fa530f..ef89aaeb8c7 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -164,7 +164,7 @@ TEST(PreparedHeap, Concurrent) { for (size_t n = 0; n < 100; n++) { last = 0; - t[0] = rocksdb::port::Thread([&heap, &last]() { + t[0] = rocksdb::port::Thread([&]() { Random rnd(1103); for (size_t seq = 1; seq <= t_cnt; seq++) { // This is not recommended usage but we should be resilient against it. From 5c76ba9dc4cbc676d8a28264b15af68c1bf06917 Mon Sep 17 00:00:00 2001 From: Patrick Zhang Date: Thu, 13 Jun 2019 11:43:35 -0700 Subject: [PATCH 138/572] Support rocksdbjava aarch64 build and test (#5258) Summary: Verified with an Ampere Computing eMAG aarch64 system. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5258 Differential Revision: D15807309 Pulled By: maysamyabandeh fbshipit-source-id: ab85d2fd3fe40e6094430ab0eba557b1e979510d --- Makefile | 5 ++++- build_tools/build_detect_platform | 2 ++ .../java/org/rocksdb/util/Environment.java | 6 +++++- .../java/org/rocksdb/util/EnvironmentTest.java | 18 ++++++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 425c75eb5f5..5944325aafe 100644 --- a/Makefile +++ b/Makefile @@ -1641,7 +1641,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64)) + ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64 aarch64)) ARCH := 64 else ARCH := 32 @@ -1655,6 +1655,9 @@ ifeq (,$(findstring ppc,$(MACHINE))) else ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so endif +ifneq (,$(findstring aarch64,$(MACHINE))) + ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so +endif ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 5d42faa30ae..ac30f9ab0fa 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -540,6 +540,8 @@ if test -z "$PORTABLE"; then elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then # TODO: Handle this with approprite options. COMMON_FLAGS="$COMMON_FLAGS" + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" != "AIX" ] && [ "$TARGET_OS" != "SunOS" ]; then diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index c019266483f..03611a248a6 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -5,6 +5,10 @@ public class Environment { private static String OS = System.getProperty("os.name").toLowerCase(); private static String ARCH = System.getProperty("os.arch").toLowerCase(); + public static boolean isAarch64() { + return ARCH.contains("aarch64"); + } + public static boolean isPowerPC() { return ARCH.contains("ppc"); } @@ -60,7 +64,7 @@ public static String getSharedLibraryFileName(final String name) { public static String getJniLibraryName(final String name) { if (isUnix()) { final String arch = is64Bit() ? 
"64" : "32"; - if(isPowerPC()) { + if(isPowerPC() || isAarch64()) { return String.format("%sjni-linux-%s", name, ARCH); } else if(isS390x()) { return String.format("%sjni-linux%s", name, ARCH); diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index 28ee04768e9..49c8bf19a91 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -130,6 +130,24 @@ public void win64() { isEqualTo("librocksdbjni.dll"); } + @Test + public void aarch64() { + setEnvironmentClassFields("Linux", "aarch64"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isAarch64()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("rocksdbjni-linux-aarch64"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); + } + private void setEnvironmentClassFields(String osName, String osArch) { setEnvironmentClassField(OS_FIELD_NAME, osName); From ec8111c5a4eb8669c097e55a75bd54f2e8c6db81 Mon Sep 17 00:00:00 2001 From: Bin Fan Date: Thu, 13 Jun 2019 12:20:30 -0700 Subject: [PATCH 139/572] Add Alluxio to USERS.md (#5434) Summary: Add Alluxio's use case of RocksDB to `USERS.md` for metadata service Pull Request resolved: https://github.com/facebook/rocksdb/pull/5434 Differential Revision: D15766559 Pulled By: riversand963 fbshipit-source-id: b68ef851f8f92e0925c31e55296260225fdf849e --- USERS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/USERS.md b/USERS.md index a95903f0662..6401757d2bd 100644 --- a/USERS.md +++ b/USERS.md @@ -50,6 +50,10 @@ Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santande ## Airbnb Airbnb is using RocksDB as a storage engine for their personalized search service. You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs +## Alluxio +[Alluxio](https://www.alluxio.io) uses RocksDB to serve and scale file system metadata to beyond 1 Billion files. The detailed design and implementation is described in this engineering blog: +https://www.alluxio.io/blog/scalable-metadata-service-in-alluxio-storing-billions-of-files/ + ## Pinterest Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo @@ -91,4 +95,4 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed [ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined using directly `.proto` files. ## IOTA Foundation - [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. \ No newline at end of file + [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. 
The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. From 2c9df9f9e5c757c8f368d0860e2da8adb63849a3 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Thu, 13 Jun 2019 13:52:43 -0700 Subject: [PATCH 140/572] Dynamic test whether sync_file_range returns ENOSYS (#5416) Summary: `sync_file_range` returns `ENOSYS` on Windows Subsystem for Linux even when using a supposedly supported filesystem like ext4. To handle this case we can do a dynamic check that a no-op `sync_file_range` invocation, which is accomplished by passing zero for the `flags` argument, succeeds. Also I rearranged the function and comments to hopefully make it more easily understandable. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5416 Differential Revision: D15807061 fbshipit-source-id: d31d94e1f228b7850ea500e6199f8b5daf8cfbd3 --- env/io_posix.cc | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/env/io_posix.cc b/env/io_posix.cc index 8b42a636295..304c4ffe1c7 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -186,28 +186,34 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { #define ZFS_SUPER_MAGIC 0x2fc12fc1 #endif -bool IsSyncFileRangeSupported(int __attribute__((__unused__)) fd) { - // `fstatfs` is only available on Linux, but so is `sync_file_range`, so - // `defined(ROCKSDB_RANGESYNC_PRESENT)` should imply `defined(OS_LINUX)`. +bool IsSyncFileRangeSupported(int fd) { + // The approach taken in this function is to build a blacklist of cases where + // we know `sync_file_range` definitely will not work properly despite passing + // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or + // if any of the checks fail in unexpected ways, we allow `sync_file_range` to + // be used. This way should minimize risk of impacting existing use cases. struct statfs buf; int ret = fstatfs(fd, &buf); assert(ret == 0); - if (ret != 0) { - // We don't know whether the filesystem properly supports `sync_file_range`. - // Even if it doesn't, we don't know of any safety issue with trying to call - // it anyways. So, to preserve the same behavior as before this `fstatfs` - // check was introduced, we assume `sync_file_range` is usable. - return true; - } - if (buf.f_type == ZFS_SUPER_MAGIC) { + if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { // Testing on ZFS showed the writeback did not happen asynchronously when // `sync_file_range` was called, even though it returned success. Avoid it // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, // even though this'll incur extra I/O for metadata. return false; } - // No known problems with other filesystems' implementations of - // `sync_file_range`, so allow them to use it. + + ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); + assert(!(ret == -1 && errno != ENOSYS)); + if (ret == -1 && errno == ENOSYS) { + // `sync_file_range` is not implemented on all platforms even if + // compile-time checks pass and a supported filesystem is in-use. For + // example, using ext4 on WSL (Windows Subsystem for Linux), + // `sync_file_range()` returns `ENOSYS` + // ("Function not implemented"). + return false; + } + // None of the cases on the blacklist matched, so allow `sync_file_range` use. 
return true;
}

From a3b8c76d8e3f2a849d354280e9baaac6728a8b4d Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 13 Jun 2019 14:38:54 -0700
Subject: [PATCH 141/572] Add missing check before calling PurgeObsoleteFiles
 in EnableFileDeletions (#5448)

Summary:
Calling PurgeObsoleteFiles with a JobContext for which HaveSomethingToDelete
is false is a precondition violation. This would trigger an assertion in debug
builds; however, in release builds with assertions disabled, it can result in
the pending_purge_obsolete_files_ counter in DBImpl underflowing, which in
turn can lead to the process hanging during database close.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5448

Differential Revision: D15792569

Pulled By: ltamasi

fbshipit-source-id: 82d92c9b4f6a9efcdc69dbb3d5a52a1ae2dd2472
---
 db/db_filesnapshot.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
index 3ff7c73f4e8..67d994f5568 100644
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -57,7 +57,9 @@ Status DBImpl::EnableFileDeletions(bool force) {
   }
   if (file_deletion_enabled) {
     ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
-    PurgeObsoleteFiles(job_context);
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
   } else {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "File Deletions Enable, but not really enabled. Counter: %d",

From bb4178066dc4f18b9b7f1d371e641db027b3edbe Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 13 Jun 2019 15:39:52 -0700
Subject: [PATCH 142/572] Integrate block cache tracer into db_impl (#5433)

Summary:
This PR integrates the block cache tracer class into db_impl.cc. db_impl.cc
now owns a BlockCacheTracer member (which manages an atomic pointer to its
BlockCacheTraceWriter) and passes a pointer to that tracer down to the
block_based_table_reader.
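As a rough usage sketch of the new API surface (the trace file path below is
hypothetical; NewFileTraceWriter is the file-backed TraceWriter factory that
the tests in this patch already use):

```
#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

// Assumes `db` is an open rocksdb::DB*.
rocksdb::TraceOptions trace_opts;  // default sampling/size limits
std::unique_ptr<rocksdb::TraceWriter> trace_writer;
rocksdb::Status s = rocksdb::NewFileTraceWriter(
    rocksdb::Env::Default(), rocksdb::EnvOptions(),
    "/tmp/block_cache_trace" /* hypothetical path */, &trace_writer);
if (s.ok()) {
  // DBImpl forwards this call to BlockCacheTracer::StartTrace.
  s = db->StartBlockCacheTrace(trace_opts, std::move(trace_writer));
}
// ... run the workload; sampled block cache accesses are written out ...
s = db->EndBlockCacheTrace();
```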
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5433 Differential Revision: D15728016 Pulled By: HaoyuHuang fbshipit-source-id: 23d5659e8c82d556833dcc1a5558aac8c1f7db71 --- TARGETS | 13 +++ db/column_family.cc | 20 ++-- db/column_family.h | 14 ++- db/compaction/compaction_job_test.cc | 6 +- db/db_impl/db_impl.cc | 14 ++- db/db_impl/db_impl.h | 11 +- db/db_wal_test.cc | 3 +- db/flush_job_test.cc | 3 +- db/memtable_list_test.cc | 4 +- db/repair.cc | 6 +- db/table_cache.cc | 8 +- db/table_cache.h | 5 +- db/version_set.cc | 18 ++-- db/version_set.h | 6 +- db/version_set_test.cc | 3 +- db/wal_manager_test.cc | 3 +- include/rocksdb/db.h | 11 ++ include/rocksdb/utilities/stackable_db.h | 10 ++ .../block_based/block_based_table_factory.cc | 3 +- table/block_based/block_based_table_reader.cc | 27 +++-- table/block_based/block_based_table_reader.h | 8 +- .../partitioned_filter_block_test.cc | 3 +- table/table_builder.h | 14 ++- tools/ldb_cmd.cc | 6 +- trace_replay/block_cache_tracer.cc | 66 +++++++++--- trace_replay/block_cache_tracer.h | 35 +++++- trace_replay/block_cache_tracer_test.cc | 102 ++++++++++++++++++ 27 files changed, 341 insertions(+), 81 deletions(-) diff --git a/TARGETS b/TARGETS index 0cdd3b162f9..7a8bb000596 100644 --- a/TARGETS +++ b/TARGETS @@ -222,6 +222,7 @@ cpp_library( "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", + "trace_replay/block_cache_tracer.cc", "trace_replay/trace_replay.cc", "util/bloom.cc", "util/build_version.cc", @@ -314,6 +315,7 @@ cpp_library( "test_util/fault_injection_test_env.cc", "test_util/testharness.cc", "test_util/testutil.cc", + "tools/block_cache_trace_analyzer.cc", "tools/trace_analyzer_tool.cc", "utilities/cassandra/test_utils.cc", ], @@ -329,6 +331,7 @@ cpp_library( name = "rocksdb_tools_lib", srcs = [ "test_util/testutil.cc", + "tools/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", ], @@ -383,6 +386,16 @@ ROCKS_TESTS = [ "table/block_based/block_based_filter_block_test.cc", "serial", ], + [ + "block_cache_trace_analyzer_test", + "tools/block_cache_trace_analyzer_test.cc", + "serial", + ], + [ + "block_cache_tracer_test", + "trace_replay/block_cache_tracer_test.cc", + "serial", + ], [ "block_test", "table/block_based/block_test.cc", diff --git a/db/column_family.cc b/db/column_family.cc index 2a2e6cb980f..e135c2d317f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -405,7 +405,8 @@ ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, - const EnvOptions& env_options, ColumnFamilySet* column_family_set) + const EnvOptions& env_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -445,7 +446,8 @@ ColumnFamilyData::ColumnFamilyData( if (_dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(ioptions_.num_levels, db_options.env, this)); - table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache, + block_cache_tracer)); if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -1254,18 +1256,20 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const EnvOptions& 
env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) : max_column_family_(0), - dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, - ColumnFamilyOptions(), *db_options, - env_options, nullptr)), + dummy_cfd_(new ColumnFamilyData( + 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, + env_options, nullptr, block_cache_tracer)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), env_options_(env_options), table_cache_(table_cache), write_buffer_manager_(write_buffer_manager), - write_controller_(write_controller) { + write_controller_(write_controller), + block_cache_tracer_(block_cache_tracer) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1333,7 +1337,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, env_options_, this); + *db_options_, env_options_, this, block_cache_tracer_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); diff --git a/db/column_family.h b/db/column_family.h index 8646b4fc197..8180f0be26a 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -24,6 +24,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" +#include "trace_replay/block_cache_tracer.h" #include "util/thread_local.h" namespace rocksdb { @@ -46,7 +47,7 @@ struct SuperVersionContext; extern const double kIncSlowdownRatio; // This file contains a list of data structures for managing column family -// level metadata. +// level metadata. // // The basic relationships among classes declared here are illustrated as // following: @@ -94,7 +95,7 @@ extern const double kIncSlowdownRatio; // | | | 1.a | | 1.b | | 1.c | // +-------------+ | | | | | | // +----------+ +----------+ +----------+ -// +// // DBImpl keeps a ColumnFamilySet, which references to all column families by // pointing to respective ColumnFamilyData object of each column family. // This is how DBImpl can list and operate on all the column families. @@ -151,7 +152,7 @@ extern const double kIncSlowdownRatio; // contains Version B, memtable a and memtable b; SuperVersion1 contains // Version B and memtable b (mutable). As a result, Version B and memtable b // are prevented from being destroyed or deleted. - + // ColumnFamilyHandleImpl is the class that clients use to access different // column families. 
It has non-trivial destructor, which gets called when client // is done using the column family @@ -504,7 +505,8 @@ class ColumnFamilyData { const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, const EnvOptions& env_options, - ColumnFamilySet* column_family_set); + ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer); uint32_t id_; const std::string name_; @@ -632,7 +634,8 @@ class ColumnFamilySet { const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -691,6 +694,7 @@ class ColumnFamilySet { Cache* table_cache_; WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; + BlockCacheTracer* const block_cache_tracer_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 66c3353fcf6..add4911891a 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -77,7 +77,8 @@ class CompactionJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), @@ -200,7 +201,8 @@ class CompactionJobTest : public testing::Test { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); compaction_job_stats_.Reset(); VersionEdit new_db; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 27d48539c35..af39b5ca11d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -237,7 +237,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, versions_.reset(new VersionSet(dbname_, &immutable_db_options_, env_options_, table_cache_.get(), write_buffer_manager_, - &write_controller_)); + &write_controller_, &block_cache_tracer_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -3924,6 +3924,18 @@ Status DBImpl::EndTrace() { return s; } +Status DBImpl::StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + return block_cache_tracer_.StartTrace(env_, trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndBlockCacheTrace() { + block_cache_tracer_.EndTrace(); + return Status::OK(); +} + Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { Status s; if (tracer_) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 4de15f0324d..942c36ff6e6 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -40,7 +40,6 @@ #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "db/memtable_list.h" #include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" @@ -53,6 +52,7 @@ #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" 
+#include "trace_replay/block_cache_tracer.h" #include "trace_replay/trace_replay.h" #include "util/autovector.h" #include "util/hash.h" @@ -331,6 +331,14 @@ class DBImpl : public DB { using DB::EndTrace; virtual Status EndTrace() override; + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override; + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, @@ -832,6 +840,7 @@ class DBImpl : public DB { recovered_transactions_; std::unique_ptr tracer_; InstrumentedMutex trace_mutex_; + BlockCacheTracer block_cache_tracer_; // State below is protected by mutex_ // With two_write_queues enabled, some of the variables that accessed during diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 9a1382e98ab..4859bdc90f4 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -838,7 +838,8 @@ class RecoveryTestHelper { versions.reset(new VersionSet(test->dbname_, &db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller)); + &write_controller, + /*block_cache_tracer=*/nullptr)); wal_manager.reset(new WalManager(db_options, env_options)); diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index ef89199c98e..130179ae67b 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -35,7 +35,8 @@ class FlushJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index f55fbdc501a..3a14b6830a6 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -100,7 +100,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -144,7 +144,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); diff --git a/db/repair.cc b/db/repair.cc index 6967a46e36c..3ae46c6e7ee 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -109,11 +109,13 @@ class Repairer { // once. 
NewLRUCache(10, db_options_.table_cache_numshardbits)), table_cache_(new TableCache(default_cf_iopts_, env_options_, - raw_table_cache_.get())), + raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), vset_(dbname_, &immutable_db_options_, env_options_, - raw_table_cache_.get(), &wb_, &wc_), + raw_table_cache_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr), next_file_number_(1), db_lock_(nullptr) { for (const auto& cfd : column_families) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 14c0169c11a..0a152f89a16 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -68,11 +68,13 @@ void AppendVarint64(IterKey* key, uint64_t v) { } // namespace TableCache::TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, Cache* const cache) + const EnvOptions& env_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer) : ioptions_(ioptions), env_options_(env_options), cache_(cache), - immortal_tables_(false) { + immortal_tables_(false), + block_cache_tracer_(block_cache_tracer) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. @@ -125,7 +127,7 @@ Status TableCache::GetTableReader( s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, - level, fd.largest_seqno), + level, fd.largest_seqno, block_cache_tracer_), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); diff --git a/db/table_cache.h b/db/table_cache.h index 64d7b898b22..1577cef82ff 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -23,6 +23,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/table_reader.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -48,7 +49,8 @@ class HistogramImpl; class TableCache { public: TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& storage_options, Cache* cache); + const EnvOptions& storage_options, Cache* cache, + BlockCacheTracer* const block_cache_tracer); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -188,6 +190,7 @@ class TableCache { Cache* const cache_; std::string row_cache_id_; bool immortal_tables_; + BlockCacheTracer* const block_cache_tracer_; }; } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 658a397fa58..30fc744c98a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3342,10 +3342,11 @@ VersionSet::VersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) - : column_family_set_( - new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, - write_buffer_manager, write_controller)), + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) + : column_family_set_(new ColumnFamilySet( + dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, block_cache_tracer)), env_(_db_options->env), dbname_(dbname), db_options_(_db_options), @@ -3359,7 +3360,8 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options) {} + 
env_options_(storage_options), + block_cache_tracer_(block_cache_tracer) {} void CloseTables(void* ptr, size_t) { TableReader* table_reader = reinterpret_cast(ptr); @@ -4445,7 +4447,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, options->table_cache_numshardbits)); WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); - VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc); + VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); Status status; std::vector dummy; @@ -5200,7 +5203,8 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteBufferManager* write_buffer_manager, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, - write_buffer_manager, write_controller) {} + write_buffer_manager, write_controller, + /*block_cache_tracer=*/nullptr) {} ReactiveVersionSet::~ReactiveVersionSet() {} diff --git a/db/version_set.h b/db/version_set.h index 8a43b982366..90be94a789a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -46,6 +46,7 @@ #include "rocksdb/env.h" #include "table/get_context.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -777,7 +778,8 @@ class VersionSet { VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); virtual ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -1125,6 +1127,8 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; + BlockCacheTracer* const block_cache_tracer_; + private: // No copying allowed VersionSet(const VersionSet&); diff --git a/db/version_set_test.cc b/db/version_set_test.cc index bf9ef8e39fe..a1278bfc7ad 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -618,7 +618,8 @@ class VersionSetTestBase { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), reactive_versions_(std::make_shared( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_)), diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 3657fb691be..1bc6a8afe83 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -50,7 +50,8 @@ class WalManagerTest : public testing::Test { versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); wal_manager_.reset(new WalManager(db_options_, env_options_)); } diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index b0538433b4a..3a32d6f82bd 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1317,6 +1317,17 @@ class DB { virtual Status EndTrace() { return Status::NotSupported("EndTrace() is not implemented."); } + + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. 
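+  // Accesses are downsampled according to TraceOptions::sampling_frequency,
+  // and writes stop silently once the trace file exceeds
+  // TraceOptions::max_trace_file_size (see trace_replay/block_cache_tracer.cc).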
+ virtual Status StartBlockCacheTrace( + const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status EndBlockCacheTrace() { + return Status::NotSupported("EndBlockCacheTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 6e98a48e591..8535952cd3e 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -315,6 +315,16 @@ class StackableDB : public DB { db_->GetColumnFamilyMetaData(column_family, cf_meta); } + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index cf205be72de..00b13033f3d 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -198,7 +198,8 @@ Status BlockBasedTableFactory::NewTableReader( file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, - table_reader_options.largest_seqno, &tail_prefetch_stats_); + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 75c8301c5c2..7434188a01d 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1020,19 +1020,17 @@ Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, return Slice(cache_key, static_cast(end - cache_key)); } -Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - std::unique_ptr&& file, - uint64_t file_size, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, - const bool prefetch_index_and_filter_in_cache, - const bool skip_filters, const int level, - const bool immortal_table, - const SequenceNumber largest_seqno, - TailPrefetchStats* tail_prefetch_stats) { +Status BlockBasedTable::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer) { table_reader->reset(); Status s; @@ -1082,7 +1080,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); 
SetupCacheKeyPrefix(rep); - std::unique_ptr new_table(new BlockBasedTable(rep)); + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); // page cache options rep->persistent_cache_options = diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 420da25932b..223746b3ac9 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -35,6 +35,7 @@ #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "trace_replay/block_cache_tracer.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/user_comparator_wrapper.h" @@ -108,7 +109,8 @@ class BlockBasedTable : public TableReader { bool skip_filters = false, int level = -1, const bool immortal_table = false, const SequenceNumber largest_seqno = 0, - TailPrefetchStats* tail_prefetch_stats = nullptr); + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -239,11 +241,13 @@ class BlockBasedTable : public TableReader { protected: Rep* rep_; - explicit BlockBasedTable(Rep* rep) : rep_(rep) {} + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} private: friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + BlockCacheTracer* const block_cache_tracer_; void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 5af7034968a..34ecfa4ac65 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -23,7 +23,8 @@ std::map slices; class MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) { + explicit MockedBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test rep->cache_key_prefix_size = 10; } diff --git a/table/table_builder.h b/table/table_builder.h index 21df978c3eb..23189200c64 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -18,6 +18,7 @@ #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" +#include "trace_replay/block_cache_tracer.h" #include "util/file_reader_writer.h" namespace rocksdb { @@ -32,10 +33,12 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, - int _level = -1) + int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr) : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, _internal_comparator, _skip_filters, _immortal, - _level, 0 /* _largest_seqno */) {} + _level, 0 /* _largest_seqno */, + _block_cache_tracer) {} // @param skip_filters Disables loading/accessing the filter block TableReaderOptions(const ImmutableCFOptions& _ioptions, @@ -43,7 +46,8 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters, bool _immortal, int _level, - SequenceNumber _largest_seqno) + SequenceNumber _largest_seqno, + 
BlockCacheTracer* const _block_cache_tracer) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -51,7 +55,8 @@ struct TableReaderOptions { skip_filters(_skip_filters), immortal(_immortal), level(_level), - largest_seqno(_largest_seqno) {} + largest_seqno(_largest_seqno), + block_cache_tracer(_block_cache_tracer) {} const ImmutableCFOptions& ioptions; const SliceTransform* prefix_extractor; @@ -65,6 +70,7 @@ struct TableReaderOptions { int level; // largest seqno in the table SequenceNumber largest_seqno; + BlockCacheTracer* const block_cache_tracer; }; struct TableBuilderOptions { diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 958d862fd32..49489173c33 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -954,7 +954,8 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, WriteController wc(options.delayed_write_rate); WriteBufferManager wb(options.db_write_buffer_size); ImmutableDBOptions immutable_db_options(options); - VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc); + VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); Status s = versions.DumpManifest(options, file, verbose, hex, json); if (!s.ok()) { printf("Error in processing file %s %s\n", file.c_str(), @@ -1664,7 +1665,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, const InternalKeyComparator cmp(opt.comparator); WriteController wc(opt.delayed_write_rate); WriteBufferManager wb(opt.db_write_buffer_size); - VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc); + VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 58c7df70b20..565511e5a07 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -23,30 +23,29 @@ bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) { record.caller == BlockCacheLookupCaller::kUserMGet); } -BlockCacheTraceWriter::BlockCacheTraceWriter( - Env* env, const TraceOptions& trace_options, - std::unique_ptr&& trace_writer) - : env_(env), - trace_options_(trace_options), - trace_writer_(std::move(trace_writer)) {} - -bool BlockCacheTraceWriter::ShouldTrace( - const BlockCacheTraceRecord& record) const { - if (trace_options_.sampling_frequency == 0 || - trace_options_.sampling_frequency == 1) { +bool ShouldTrace(const BlockCacheTraceRecord& record, + const TraceOptions& trace_options) { + if (trace_options.sampling_frequency == 0 || + trace_options.sampling_frequency == 1) { return true; } // We use spatial downsampling so that we have a complete access history for a // block. 
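  // For example, with sampling_frequency == 4, a block whose key hashes to a
  // multiple of four has every one of its accesses recorded, while all other
  // blocks are skipped entirely; sampling individual accesses instead would
  // leave every per-block access history incomplete.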
const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); - return hash % trace_options_.sampling_frequency == 0; + return hash % trace_options.sampling_frequency == 0; } +BlockCacheTraceWriter::BlockCacheTraceWriter( + Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)) {} + Status BlockCacheTraceWriter::WriteBlockAccess( const BlockCacheTraceRecord& record) { uint64_t trace_file_size = trace_writer_->GetFileSize(); - if (trace_file_size > trace_options_.max_trace_file_size || - !ShouldTrace(record)) { + if (trace_file_size > trace_options_.max_trace_file_size) { return Status::OK(); } Trace trace; @@ -68,7 +67,6 @@ Status BlockCacheTraceWriter::WriteBlockAccess( } std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); - InstrumentedMutexLock lock_guard(&trace_writer_mutex_); return trace_writer_->Write(encoded_trace); } @@ -81,7 +79,6 @@ Status BlockCacheTraceWriter::WriteHeader() { PutFixed32(&trace.payload, kMinorVersion); std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); - InstrumentedMutexLock lock_guard(&trace_writer_mutex_); return trace_writer_->Write(encoded_trace); } @@ -216,4 +213,41 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::OK(); } +BlockCacheTracer::BlockCacheTracer() { writer_.store(nullptr); } + +BlockCacheTracer::~BlockCacheTracer() { EndTrace(); } + +Status BlockCacheTracer::StartTrace( + Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (writer_.load()) { + return Status::OK(); + } + trace_options_ = trace_options; + writer_.store( + new BlockCacheTraceWriter(env, trace_options, std::move(trace_writer))); + return writer_.load()->WriteHeader(); +} + +void BlockCacheTracer::EndTrace() { + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (!writer_.load()) { + return; + } + delete writer_.load(); + writer_.store(nullptr); +} + +Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { + if (!writer_.load() || !ShouldTrace(record, trace_options_)) { + return Status::OK(); + } + InstrumentedMutexLock lock_guard(&trace_writer_mutex_); + if (!writer_.load()) { + return Status::OK(); + } + return writer_.load()->WriteBlockAccess(record); +} + } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e24d5a5ef35..320e6d67b3c 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -5,6 +5,8 @@ #pragma once +#include + #include "monitoring/instrumented_mutex.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -101,13 +103,9 @@ class BlockCacheTraceWriter { Status WriteHeader(); private: - bool ShouldTrace(const BlockCacheTraceRecord& record) const; - Env* env_; TraceOptions trace_options_; std::unique_ptr trace_writer_; - /*Mutex to protect trace_writer_ */ - InstrumentedMutex trace_writer_mutex_; }; // BlockCacheTraceReader helps read the trace file generated by @@ -130,4 +128,33 @@ class BlockCacheTraceReader { std::unique_ptr trace_reader_; }; +// A block cache tracer. It downsamples the accesses according to +// trace_options and uses BlockCacheTraceWriter to write the access record to +// the trace file. +class BlockCacheTracer { + public: + BlockCacheTracer(); + ~BlockCacheTracer(); + // No copy and move. 
+ BlockCacheTracer(const BlockCacheTracer&) = delete; + BlockCacheTracer& operator=(const BlockCacheTracer&) = delete; + BlockCacheTracer(BlockCacheTracer&&) = delete; + BlockCacheTracer& operator=(BlockCacheTracer&&) = delete; + + // Start writing block cache accesses to the trace_writer. + Status StartTrace(Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); + + // Stop writing block cache accesses to the trace_writer. + void EndTrace(); + + Status WriteBlockAccess(const BlockCacheTraceRecord& record); + + private: + TraceOptions trace_options_; + // A mutex protects the writer_. + InstrumentedMutex trace_writer_mutex_; + std::atomic writer_; +}; + } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 28052d9db8d..c6fc3e4acee 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -80,6 +80,26 @@ class BlockCacheTracerTest : public testing::Test { } } + BlockCacheTraceRecord GenerateAccessRecord() { + uint32_t key_id = 0; + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = kBlockSize; + record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.access_timestamp = env_->NowMicros(); + record.cf_id = kCFId; + record.cf_name = kDefaultColumnFamilyName; + record.caller = GetCaller(key_id); + record.level = kLevel; + record.sst_fd_number = kSSTFDNumber + key_id; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + record.referenced_key = kRefKeyPrefix + std::to_string(key_id); + record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.num_keys_in_block = kNumKeysInBlock; + return record; + } + void VerifyAccess(BlockCacheTraceReader* reader, uint32_t from_key_id, TraceType block_type, uint32_t nblocks) { assert(reader); @@ -118,6 +138,88 @@ class BlockCacheTracerTest : public testing::Test { std::string test_path_; }; +TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) { + BlockCacheTraceRecord record = GenerateAccessRecord(); + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTracer writer; + // The record should be written to the trace_file since StartTrace is not + // called. + ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Verify trace file contains nothing. + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_NOK(reader.ReadHeader(&header)); + } +} + +TEST_F(BlockCacheTracerTest, AtomicWrite) { + BlockCacheTraceRecord record = GenerateAccessRecord(); + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTracer writer; + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Verify trace file contains one record. 
+ std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, header.rocksdb_major_version); + ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version); + VerifyAccess(&reader, 0, TraceType::kBlockTraceDataBlock, 1); + ASSERT_NOK(reader.ReadAccess(&record)); + } +} + +TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { + BlockCacheTraceRecord record = GenerateAccessRecord(); + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTracer writer; + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_OK(writer.WriteBlockAccess(record)); + writer.EndTrace(); + // Write the record again. This time the record should not be written since + // EndTrace is called. + ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Verify trace file contains one record. + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, header.rocksdb_major_version); + ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version); + VerifyAccess(&reader, 0, TraceType::kBlockTraceDataBlock, 1); + ASSERT_NOK(reader.ReadAccess(&record)); + } +} + TEST_F(BlockCacheTracerTest, MixedBlocks) { { // Generate a trace file containing a mix of blocks. From 89695bfbaafd6fd589ad37e31ab27d9cf25e9930 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 14 Jun 2019 09:13:48 -0700 Subject: [PATCH 143/572] Remove unused variable (#5457) Summary: This PR removes the unused variable that causes CLANG build to fail. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5457 Differential Revision: D15825027 Pulled By: HaoyuHuang fbshipit-source-id: 72c847c39ca310560efcbc5938cffa6f31164068 --- trace_replay/block_cache_tracer_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index c6fc3e4acee..0f3ca67c611 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -141,7 +141,6 @@ class BlockCacheTracerTest : public testing::Test { TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) { BlockCacheTraceRecord record = GenerateAccessRecord(); { - TraceOptions trace_opt; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); From 58c78358ef0442ec3adeffa1df1dd43a593177ce Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Fri, 14 Jun 2019 10:33:45 -0700 Subject: [PATCH 144/572] Set executeLocal on child lego jobs (#5456) Summary: This property is needed to run the child jobs on the same host and thus propagate the child job status back to the parent's. 
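For illustration, every job spec emitted by the determinator now carries the
flag next to its existing fields (abridged sketch; the field names match the
hunks below):

```
{
  'name':'Rocksdb Unit Test',
  'oncall':'$ONCALL',
  'executeLocal': 'true',
  'steps': [ ... ]
}
```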
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5456 Reviewed By: yancouto Differential Revision: D15824382 Pulled By: maysamyabandeh fbshipit-source-id: 42f2efbedaa3a8b399281105f0ce793c1c9a6191 --- build_tools/rocksdb-lego-determinator | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index e47b2ef30d8..dc32b3af9ff 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -133,6 +133,7 @@ UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -153,6 +154,7 @@ UNIT_TEST_NON_SHM_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -175,6 +177,7 @@ RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -195,6 +198,7 @@ UNIT_TEST_COMMANDS_481="[ { 'name':'Rocksdb Unit Test on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -215,6 +219,7 @@ RELEASE_BUILD_COMMANDS_481="[ { 'name':'Rocksdb Release on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -235,6 +240,7 @@ CLANG_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -255,6 +261,7 @@ CLANG_RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb CLANG Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -275,6 +282,7 @@ CLANG_ANALYZE_COMMANDS="[ { 'name':'Rocksdb analyze', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -295,6 +303,7 @@ CODE_COV_COMMANDS="[ { 'name':'Rocksdb Unit Test Code Coverage', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -315,6 +324,7 @@ UNITY_COMMANDS="[ { 'name':'Rocksdb Unity', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -335,6 +345,7 @@ LITE_BUILD_COMMANDS="[ { 'name':'Rocksdb Lite build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -354,6 +365,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ { 'name':'Rocksdb Lite Binary Size', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -371,6 +383,7 @@ STRESS_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb Stress and Crash Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -399,6 +412,7 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb Stress and Crash Test with atomic flush', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -427,6 +441,7 @@ WRITE_STRESS_COMMANDS="[ { 'name':'Rocksdb Write Stress Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -449,6 +464,7 @@ ASAN_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -469,6 +485,7 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -491,6 +508,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb crash test with atomic flush under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -513,6 +531,7 @@ UBSAN_TEST_COMMANDS="[ 
{ 'name':'Rocksdb Unit Test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -533,6 +552,7 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -555,6 +575,7 @@ UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb crash test with atomic flush under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -577,6 +598,7 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under valgrind', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -599,6 +621,7 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -621,6 +644,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb Crash Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -643,6 +667,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb Crash Test with atomic flush under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -675,6 +700,7 @@ FORMAT_COMPATIBLE_COMMANDS="[ { 'name':'Rocksdb Format Compatible tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -708,6 +734,7 @@ NO_COMPRESSION_COMMANDS="[ { 'name':'Rocksdb No Compression tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -785,6 +812,7 @@ JAVA_BUILD_TEST_COMMANDS="[ { 'name':'Rocksdb Java Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { From b47cfec5d01fd8c848b7539e5b43884257dba613 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Fri, 14 Jun 2019 11:24:02 -0700 Subject: [PATCH 145/572] fix compilation error on MSVC (#5458) Summary: "__attribute__((__weak__))" was introduced in port\jemalloc_helper.h. It's not supported by Microsoft VS 2015, resulting in compile error. This fix adds a #if branch to work around the compile issue. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5458 Differential Revision: D15827285 fbshipit-source-id: 8c5f7ad31de1ac677bd96f16c4450767de834beb --- port/jemalloc_helper.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 0c216face13..26e5fb66336 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -16,6 +16,14 @@ #define JEMALLOC_CXX_THROW #endif +#if defined(OS_WIN) && defined(_MSC_VER) + +// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is defined, +// Jemalloc memory allocator is used. +static inline bool HasJemalloc() { return true; } + +#else + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. 
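// For example, when jemalloc is not linked in, a weakly declared mallocx
// resolves to nullptr, so a check like (mallocx != nullptr) works as a cheap
// runtime probe; MSVC has no weak-symbol equivalent, hence the unconditional
// branch above.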
extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); @@ -50,4 +58,6 @@ static inline bool HasJemalloc() { malloc_stats_print != nullptr && malloc_usable_size != nullptr; } +#endif + #endif // ROCKSDB_JEMALLOC From f1219644ec834a96f3a2a13d83046126e8e7409d Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 14 Jun 2019 14:07:50 -0700 Subject: [PATCH 146/572] Validate CF Options when creating a new column family (#5453) Summary: It seems like CF Options are not properly validated when creating a new column family with `CreateColumnFamily` API; only a selected few checks are done. Calling `ColumnFamilyData::ValidateOptions`, which is the single source for all CFOptions validations, will help fix this. (`ColumnFamilyData::ValidateOptions` is already called at the time of `DB::Open`). **Test Plan:** Added a new test: `DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions` ``` TEST_TMPDIR=/dev/shm ./db_test --gtest_filter=DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions ``` Also ran gtest-parallel to make sure the new test is not flaky. ``` TEST_TMPDIR=/dev/shm ~/gtest-parallel/gtest-parallel ./db_test --gtest_filter=DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions --repeat=10000 [10000/10000] DBTest.CreateColumnFamilyShouldFailOnIncompatibleOptions (15 ms) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5453 Differential Revision: D15816851 Pulled By: sagar0 fbshipit-source-id: 9e702b9850f5c4a7e0ef8d39e1e6f9b81e7fe1e5 --- HISTORY.md | 1 + db/db_impl/db_impl.cc | 10 +++------- db/db_test.cc | 13 +++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5574c769878..228d02b61df 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,7 @@ ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. +* Improve ColumnFamilyOptions validation when creating a new column family. ### Bug Fixes * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index af39b5ca11d..154e6dd2339 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1944,13 +1944,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, Status persist_options_status; *handle = nullptr; - s = CheckCompressionSupported(cf_options); - if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cf_options); - } - if (s.ok()) { - s = CheckCFPathsSupported(initial_db_options_, cf_options); - } + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + s = ColumnFamilyData::ValidateOptions(db_options, cf_options); if (s.ok()) { for (auto& cf_path : cf_options.cf_paths) { s = env_->CreateDirIfMissing(cf_path.path); diff --git a/db/db_test.cc b/db/db_test.cc index 3bac53f2f0a..0204f4d9f62 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5978,6 +5978,19 @@ TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { } } +TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) { + Options options = CurrentOptions(); + options.max_open_files = 100; + Reopen(options); + + ColumnFamilyOptions cf_options(options); + // ttl is only supported when max_open_files is -1. 
+ cf_options.ttl = 3600; + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_options, "pikachu", &handle)); + delete handle; +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, RowCache) { Options options = CurrentOptions(); From 7a8d7358bb40b13a06c2c6adc62e80295d89ed05 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 14 Jun 2019 17:37:24 -0700 Subject: [PATCH 147/572] Integrate block cache tracer in block based table reader. (#5441) Summary: This PR integrates the block cache tracer into block based table reader. The tracer will write the block cache accesses using the trace_writer. The tracer is null in this PR so that nothing will be logged. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5441 Differential Revision: D15772029 Pulled By: HaoyuHuang fbshipit-source-id: a64adb92642cd23222e0ba8b10d86bf522b42f9b --- table/block_based/block_based_table_reader.cc | 265 ++++++++++++++---- table/block_based/block_based_table_reader.h | 18 ++ tools/block_cache_trace_analyzer.h | 5 +- tools/block_cache_trace_analyzer_test.cc | 5 +- trace_replay/block_cache_tracer.cc | 62 ++-- trace_replay/block_cache_tracer.h | 89 +++++- trace_replay/block_cache_tracer_test.cc | 31 +- 7 files changed, 365 insertions(+), 110 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 7434188a01d..0caea508822 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1877,9 +1877,8 @@ CachableEntry BlockBasedTable::GetFilter( CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* /*lookup_context*/, + BlockCacheLookupContext* lookup_context, const SliceTransform* prefix_extractor) const { - // TODO(haoyu): Trace filter block access here. // If cache_index_and_filter_blocks is false, filter should be pre-populated. // We will return rep_->filter anyway. rep_->filter can be nullptr if filter // read fails at Open() time. We don't want to reload again since it will @@ -1912,17 +1911,22 @@ CachableEntry BlockBasedTable::GetFilter( GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); FilterBlockReader* filter = nullptr; + size_t usage = 0; + bool is_cache_hit = false; + bool return_empty_reader = false; if (cache_handle != nullptr) { filter = reinterpret_cast(block_cache->Value(cache_handle)); + usage = filter->ApproximateMemoryUsage(); + is_cache_hit = true; } else if (no_io) { // Do not invoke any io. - return CachableEntry(); + return_empty_reader = true; } else { filter = ReadFilter(prefetch_buffer, filter_blk_handle, is_a_filter_partition, prefix_extractor); if (filter != nullptr) { - size_t usage = filter->ApproximateMemoryUsage(); + usage = filter->ApproximateMemoryUsage(); Status s = block_cache->Insert( key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, rep_->table_options.cache_index_and_filter_blocks_with_high_priority @@ -1934,19 +1938,36 @@ CachableEntry BlockBasedTable::GetFilter( } else { RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); delete filter; - return CachableEntry(); + return_empty_reader = true; } } } + if (block_cache_tracer_ && lookup_context) { + // Avoid making copy of block_key and cf_name when constructing the access + // record. 
+ BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", TraceType::kBlockTraceFilterBlock, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + /*no_insert=*/no_io); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + /*referenced_key=*/nullptr); + } + + if (return_empty_reader) { + return CachableEntry(); + } return {filter, cache_handle ? block_cache : nullptr, cache_handle, /*own_value=*/false}; } CachableEntry BlockBasedTable::GetUncompressionDict( FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* /*lookup_context*/) const { - // TODO(haoyu): Trace the access on the uncompression dictionary here. + BlockCacheLookupContext* lookup_context) const { if (!rep_->table_options.cache_index_and_filter_blocks) { // block cache is either disabled or not used for meta-blocks. In either // case, BlockBasedTableReader is the owner of the uncompression dictionary. @@ -1964,9 +1985,13 @@ CachableEntry BlockBasedTable::GetUncompressionDict( GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key, BlockType::kCompressionDictionary, get_context); UncompressionDict* dict = nullptr; + bool is_cache_hit = false; + size_t usage = 0; if (cache_handle != nullptr) { dict = reinterpret_cast( rep_->table_options.block_cache->Value(cache_handle)); + is_cache_hit = true; + usage = dict->ApproximateMemoryUsage(); } else if (no_io) { // Do not invoke any io. } else { @@ -1980,7 +2005,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( new UncompressionDict(compression_dict_block->data.ToString(), rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); - const size_t usage = uncompression_dict->ApproximateMemoryUsage(); + usage = uncompression_dict->ApproximateMemoryUsage(); s = rep_->table_options.block_cache->Insert( cache_key, uncompression_dict.get(), usage, &DeleteCachedUncompressionDictEntry, &cache_handle, @@ -2000,6 +2025,20 @@ CachableEntry BlockBasedTable::GetUncompressionDict( } } } + if (block_cache_tracer_ && lookup_context) { + // Avoid making copy of block_key and cf_name when constructing the access + // record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + /*no_insert=*/no_io); + block_cache_tracer_->WriteBlockAccess(access_record, cache_key, + rep_->cf_name_for_tracing(), + /*referenced_key=*/nullptr); + } return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr, cache_handle, false /* own_value */}; } @@ -2116,13 +2155,10 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, - BlockCacheLookupContext* /*lookup_context*/) const { - // TODO(haoyu): Trace data/index/range deletion block access here. 
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); - // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = rep_->immortal_table ? nullptr @@ -2136,6 +2172,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key /* key to the block cache */; Slice ckey /* key to the compressed block cache */; + bool is_cache_hit = false; + bool no_insert = true; if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache if (block_cache != nullptr) { @@ -2152,10 +2190,15 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, ro, block_entry, uncompression_dict, block_type, get_context); - + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { + no_insert = false; Statistics* statistics = rep_->ioptions.statistics; bool do_decompress = block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; @@ -2186,6 +2229,59 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } } } + + // Fill lookup_context. + if (block_cache_tracer_ && lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (block_entry->GetValue()) { + // Approximate the number of keys in the block using restarts. + nkeys = rep_->table_options.block_restart_interval * + block_entry->GetValue()->NumRestarts(); + usage = block_entry->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + if (BlockCacheTraceHelper::ShouldTraceReferencedKey( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., the referenced key, + // referenced_key_exist_in_block. + + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copy of block_key and cf_name when constructing the access + // record. 
+ BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + /*referenced_key=*/nullptr); + } + } + assert(s.ok() || block_entry->GetValue() == nullptr); return s; } @@ -2874,11 +2970,15 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } else { + BlockCacheLookupContext lookup_data_block_context{ + BlockCacheLookupCaller::kUserGet}; + bool does_referenced_key_exist = false; DataBlockIter biter; + uint64_t referenced_data_size = 0; NewDataBlockIterator( read_options, iiter->value(), &biter, BlockType::kData, /*key_includes_seq=*/true, - /*index_key_is_full=*/true, get_context, &lookup_context, + /*index_key_is_full=*/true, get_context, &lookup_data_block_context, /*s=*/Status(), /*prefetch_buffer*/ nullptr); if (read_options.read_tier == kBlockCacheTier && @@ -2902,25 +3002,47 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break // from the top level for-loop. - break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - done = true; - break; + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; + } } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. 
+ BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } - s = biter.status(); } + if (done) { // Avoid the extra Next which is expensive in two-level indexes break; @@ -2992,14 +3114,18 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { bool reusing_block = true; + uint64_t referenced_data_size = 0; + bool does_referenced_key_exist = false; + BlockCacheLookupContext lookup_data_block_context( + BlockCacheLookupCaller::kUserMGet); if (iiter->value().offset() != offset) { offset = iiter->value().offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( read_options, iiter->value(), &biter, BlockType::kData, /*key_includes_seq=*/false, - /*index_key_is_full=*/true, get_context, &lookup_context, - Status(), nullptr); + /*index_key_is_full=*/true, get_context, + &lookup_data_block_context, Status(), nullptr); reusing_block = false; } if (read_options.read_tier == kBlockCacheTier && @@ -3021,38 +3147,59 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break // from the top level for-loop. 
- break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - Cleanable dummy; - Cleanable* value_pinner = nullptr; - - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - if (biter.IsValuePinned()) { - if (reusing_block) { - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(biter.cache_handle() != nullptr); - block_cache->Ref(biter.cache_handle()); - dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, - biter.cache_handle()); - value_pinner = &dummy; - } else { - value_pinner = &biter; + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter.IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter.cache_handle() != nullptr); + block_cache->Ref(biter.cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter.cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = &biter; + } } - } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, value_pinner)) { - done = true; - break; + if (!get_context->SaveValue(parsed_key, biter.value(), &matched, + value_pinner)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; + } } + s = biter.status(); + } + // Write the block cache access. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } - s = biter.status(); if (done) { // Avoid the extra Next which is expensive in two-level indexes break; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 223746b3ac9..17c4e7238c8 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -17,6 +17,7 @@ #include #include "db/range_tombstone_fragmenter.h" +#include "file/filename.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/persistent_cache.h" @@ -571,6 +572,23 @@ struct BlockBasedTable::Rep { ? kDisableGlobalSequenceNumber : global_seqno; } + + uint64_t cf_id_for_tracing() const { + return table_properties ? table_properties->column_family_id + : rocksdb::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? 
table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; + } }; // Iterates over the contents of BlockBasedTable. diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 9dde8a939b5..51bb1ec7930 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -35,10 +35,11 @@ struct BlockAccessInfo { block_size = access.block_size; caller_num_access_map[access.caller]++; num_accesses++; - if (ShouldTraceReferencedKey(access)) { + if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type, + access.caller)) { num_keys = access.num_keys_in_block; - if (access.is_referenced_key_exist_in_block == Boolean::kTrue) { + if (access.referenced_key_exist_in_block == Boolean::kTrue) { key_num_access_map[access.referenced_key]++; num_referenced_key_exist_in_block++; } else { diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index 96f52c1ec00..a75804492f6 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -89,9 +89,10 @@ class BlockCacheTracerTest : public testing::Test { // The writer should only write these fields for data blocks and the // caller is either GET or MGET. record.referenced_key = kRefKeyPrefix + std::to_string(key_id); - record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; - ASSERT_OK(writer->WriteBlockAccess(record)); + ASSERT_OK(writer->WriteBlockAccess( + record, record.block_key, record.cf_name, record.referenced_key)); } } diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 565511e5a07..f733bc9005f 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -15,13 +15,6 @@ namespace rocksdb { namespace { const unsigned int kCharSize = 1; -} // namespace - -bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) { - return (record.block_type == TraceType::kBlockTraceDataBlock) && - (record.caller == BlockCacheLookupCaller::kUserGet || - record.caller == BlockCacheLookupCaller::kUserMGet); -} bool ShouldTrace(const BlockCacheTraceRecord& record, const TraceOptions& trace_options) { @@ -34,6 +27,17 @@ bool ShouldTrace(const BlockCacheTraceRecord& record, const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); return hash % trace_options.sampling_frequency == 0; } +} // namespace + +const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = + "UnknownColumnFamily"; + +bool BlockCacheTraceHelper::ShouldTraceReferencedKey( + TraceType block_type, BlockCacheLookupCaller caller) { + return (block_type == TraceType::kBlockTraceDataBlock) && + (caller == BlockCacheLookupCaller::kUserGet || + caller == BlockCacheLookupCaller::kUserMGet); +} BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, @@ -43,7 +47,8 @@ BlockCacheTraceWriter::BlockCacheTraceWriter( trace_writer_(std::move(trace_writer)) {} Status BlockCacheTraceWriter::WriteBlockAccess( - const BlockCacheTraceRecord& record) { + const BlockCacheTraceRecord& record, const Slice& block_key, + const Slice& cf_name, const Slice& referenced_key) { uint64_t trace_file_size = 
trace_writer_->GetFileSize(); if (trace_file_size > trace_options_.max_trace_file_size) { return Status::OK(); @@ -51,19 +56,21 @@ Status BlockCacheTraceWriter::WriteBlockAccess( Trace trace; trace.ts = record.access_timestamp; trace.type = record.block_type; - PutLengthPrefixedSlice(&trace.payload, record.block_key); + PutLengthPrefixedSlice(&trace.payload, block_key); PutFixed64(&trace.payload, record.block_size); - PutFixed32(&trace.payload, record.cf_id); - PutLengthPrefixedSlice(&trace.payload, record.cf_name); + PutFixed64(&trace.payload, record.cf_id); + PutLengthPrefixedSlice(&trace.payload, cf_name); PutFixed32(&trace.payload, record.level); - PutFixed32(&trace.payload, record.sst_fd_number); + PutFixed64(&trace.payload, record.sst_fd_number); trace.payload.push_back(record.caller); trace.payload.push_back(record.is_cache_hit); trace.payload.push_back(record.no_insert); - if (ShouldTraceReferencedKey(record)) { - PutLengthPrefixedSlice(&trace.payload, record.referenced_key); + if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type, + record.caller)) { + PutLengthPrefixedSlice(&trace.payload, referenced_key); + PutFixed64(&trace.payload, record.referenced_data_size); PutFixed64(&trace.payload, record.num_keys_in_block); - trace.payload.push_back(record.is_referenced_key_exist_in_block); + trace.payload.push_back(record.referenced_key_exist_in_block); } std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); @@ -143,6 +150,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { record->access_timestamp = trace.ts; record->block_type = trace.type; Slice enc_slice = Slice(trace.payload); + Slice block_key; if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) { return Status::Incomplete( @@ -153,7 +161,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::Incomplete( "Incomplete access record: Failed to read block size."); } - if (!GetFixed32(&enc_slice, &record->cf_id)) { + if (!GetFixed64(&enc_slice, &record->cf_id)) { return Status::Incomplete( "Incomplete access record: Failed to read column family ID."); } @@ -167,7 +175,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::Incomplete( "Incomplete access record: Failed to read level."); } - if (!GetFixed32(&enc_slice, &record->sst_fd_number)) { + if (!GetFixed64(&enc_slice, &record->sst_fd_number)) { return Status::Incomplete( "Incomplete access record: Failed to read SST file number."); } @@ -190,13 +198,18 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { record->no_insert = static_cast(enc_slice[0]); enc_slice.remove_prefix(kCharSize); - if (ShouldTraceReferencedKey(*record)) { + if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type, + record->caller)) { Slice referenced_key; if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { return Status::Incomplete( "Incomplete access record: Failed to read the referenced key."); } record->referenced_key = referenced_key.ToString(); + if (!GetFixed64(&enc_slice, &record->referenced_data_size)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the referenced data size."); + } if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) { return Status::Incomplete( "Incomplete access record: Failed to read the number of keys in the " @@ -205,10 +218,9 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { if (enc_slice.empty()) { return Status::Incomplete( 
"Incomplete access record: Failed to read " - "is_referenced_key_exist_in_block."); + "referenced_key_exist_in_block."); } - record->is_referenced_key_exist_in_block = - static_cast(enc_slice[0]); + record->referenced_key_exist_in_block = static_cast(enc_slice[0]); } return Status::OK(); } @@ -239,7 +251,10 @@ void BlockCacheTracer::EndTrace() { writer_.store(nullptr); } -Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { +Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, + const Slice& cf_name, + const Slice& referenced_key) { if (!writer_.load() || !ShouldTrace(record, trace_options_)) { return Status::OK(); } @@ -247,7 +262,8 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { if (!writer_.load()) { return Status::OK(); } - return writer_.load()->WriteBlockAccess(record); + return writer_.load()->WriteBlockAccess(record, block_key, cf_name, + referenced_key); } } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 320e6d67b3c..bf88133111e 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -49,28 +49,80 @@ struct BlockCacheLookupContext { BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) : caller(_caller) {} const BlockCacheLookupCaller caller; + // These are populated when we perform lookup/insert on block cache. The block + // cache tracer uses these inforation when logging the block access at + // BlockBasedTable::GET and BlockBasedTable::MultiGet. + bool is_cache_hit = false; + bool no_insert = false; + TraceType block_type = TraceType::kTraceMax; + uint64_t block_size = 0; + std::string block_key; + uint64_t num_keys_in_block = 0; + + void FillLookupContext(bool _is_cache_hit, bool _no_insert, + TraceType _block_type, uint64_t _block_size, + const std::string& _block_key, + uint64_t _num_keys_in_block) { + is_cache_hit = _is_cache_hit; + no_insert = _no_insert; + block_type = _block_type; + block_size = _block_size; + block_key = _block_key; + num_keys_in_block = _num_keys_in_block; + } }; enum Boolean : char { kTrue = 1, kFalse = 0 }; struct BlockCacheTraceRecord { // Required fields for all accesses. - uint64_t access_timestamp; + uint64_t access_timestamp = 0; std::string block_key; - TraceType block_type; - uint64_t block_size; - uint32_t cf_id; + TraceType block_type = TraceType::kTraceMax; + uint64_t block_size = 0; + uint64_t cf_id = 0; std::string cf_name; - uint32_t level; - uint32_t sst_fd_number; - BlockCacheLookupCaller caller; - Boolean is_cache_hit; - Boolean no_insert; + uint32_t level = 0; + uint64_t sst_fd_number = 0; + BlockCacheLookupCaller caller = + BlockCacheLookupCaller::kMaxBlockCacheLookupCaller; + Boolean is_cache_hit = Boolean::kFalse; + Boolean no_insert = Boolean::kFalse; // Required fields for data block and user Get/Multi-Get only. 
std::string referenced_key; + uint64_t referenced_data_size = 0; uint64_t num_keys_in_block = 0; - Boolean is_referenced_key_exist_in_block = Boolean::kFalse; + Boolean referenced_key_exist_in_block = Boolean::kFalse; + + BlockCacheTraceRecord() {} + + BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, + TraceType _block_type, uint64_t _block_size, + uint64_t _cf_id, std::string _cf_name, uint32_t _level, + uint64_t _sst_fd_number, BlockCacheLookupCaller _caller, + bool _is_cache_hit, bool _no_insert, + std::string _referenced_key = "", + uint64_t _referenced_data_size = 0, + uint64_t _num_keys_in_block = 0, + bool _referenced_key_exist_in_block = false) + : access_timestamp(_access_timestamp), + block_key(_block_key), + block_type(_block_type), + block_size(_block_size), + cf_id(_cf_id), + cf_name(_cf_name), + level(_level), + sst_fd_number(_sst_fd_number), + caller(_caller), + is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), + no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), + referenced_key(_referenced_key), + referenced_data_size(_referenced_data_size), + num_keys_in_block(_num_keys_in_block), + referenced_key_exist_in_block( + _referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) { + } }; struct BlockCacheTraceHeader { @@ -79,7 +131,13 @@ struct BlockCacheTraceHeader { uint32_t rocksdb_minor_version; }; -bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record); +class BlockCacheTraceHelper { + public: + static bool ShouldTraceReferencedKey(TraceType block_type, + BlockCacheLookupCaller caller); + + static const std::string kUnknownColumnFamilyName; +}; // BlockCacheTraceWriter captures all RocksDB block cache accesses using a // user-provided TraceWriter. Every RocksDB operation is written as a single @@ -96,7 +154,10 @@ class BlockCacheTraceWriter { BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete; BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete; - Status WriteBlockAccess(const BlockCacheTraceRecord& record); + // Pass Slice references to avoid copy. + Status WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, const Slice& cf_name, + const Slice& referenced_key); // Write a trace header at the beginning, typically on initiating a trace, // with some metadata like a magic number and RocksDB version. @@ -148,7 +209,9 @@ class BlockCacheTracer { // Stop writing block cache accesses to the trace_writer. 
void EndTrace(); - Status WriteBlockAccess(const BlockCacheTraceRecord& record); + Status WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, const Slice& cf_name, + const Slice& referenced_key); private: TraceOptions trace_options_; diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 0f3ca67c611..95fe16b8c8f 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -20,6 +20,7 @@ const uint32_t kLevel = 1; const uint64_t kSSTFDNumber = 100; const std::string kRefKeyPrefix = "test-get-"; const uint64_t kNumKeysInBlock = 1024; +const uint64_t kReferencedDataSize = 10; } // namespace class BlockCacheTracerTest : public testing::Test { @@ -61,7 +62,7 @@ class BlockCacheTracerTest : public testing::Test { BlockCacheTraceRecord record; record.block_type = block_type; record.block_size = kBlockSize + key_id; - record.block_key = kBlockKeyPrefix + std::to_string(key_id); + record.block_key = (kBlockKeyPrefix + std::to_string(key_id)); record.access_timestamp = env_->NowMicros(); record.cf_id = kCFId; record.cf_name = kDefaultColumnFamilyName; @@ -73,10 +74,12 @@ class BlockCacheTracerTest : public testing::Test { // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. - record.referenced_key = kRefKeyPrefix + std::to_string(key_id); - record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_key = (kRefKeyPrefix + std::to_string(key_id)); + record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; - ASSERT_OK(writer->WriteBlockAccess(record)); + record.referenced_data_size = kReferencedDataSize + key_id; + ASSERT_OK(writer->WriteBlockAccess( + record, record.block_key, record.cf_name, record.referenced_key)); } } @@ -95,7 +98,7 @@ class BlockCacheTracerTest : public testing::Test { record.is_cache_hit = Boolean::kFalse; record.no_insert = Boolean::kFalse; record.referenced_key = kRefKeyPrefix + std::to_string(key_id); - record.is_referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; return record; } @@ -122,13 +125,15 @@ class BlockCacheTracerTest : public testing::Test { record.caller == BlockCacheLookupCaller::kUserMGet)) { ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), record.referenced_key); - ASSERT_EQ(Boolean::kTrue, record.is_referenced_key_exist_in_block); + ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block); ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block); + ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size); continue; } ASSERT_EQ("", record.referenced_key); - ASSERT_EQ(Boolean::kFalse, record.is_referenced_key_exist_in_block); + ASSERT_EQ(Boolean::kFalse, record.referenced_key_exist_in_block); ASSERT_EQ(0, record.num_keys_in_block); + ASSERT_EQ(0, record.referenced_data_size); } } @@ -147,7 +152,8 @@ TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) { BlockCacheTracer writer; // The record should be written to the trace_file since StartTrace is not // called. 
- ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { @@ -170,7 +176,8 @@ TEST_F(BlockCacheTracerTest, AtomicWrite) { &trace_writer)); BlockCacheTracer writer; ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { @@ -197,11 +204,13 @@ TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { &trace_writer)); BlockCacheTracer writer; ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); writer.EndTrace(); // Write the record again. This time the record should not be written since // EndTrace is called. - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { From d1ae67bdb921e32b7d5c2ad614a1b69faab64c9c Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 17 Jun 2019 10:15:58 -0700 Subject: [PATCH 148/572] Switch Travis to Xenial build (#4789) Summary: I think this should now also run on Travis's new virtualised infrastructure which affords more memory and CPU. We also need to think about migrating from travis-ci.org to travis-ci.com. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4789 Differential Revision: D15856272 fbshipit-source-id: 10b41d21924e8a362bc9646a63ccd1a5dfc437c6 --- .travis.yml | 13 +- CMakeLists.txt | 1 + java/CMakeLists.txt | 301 +++++++++++++++++++++++++------------------- java/Makefile | 15 +++ 4 files changed, 193 insertions(+), 137 deletions(-) diff --git a/.travis.yml b/.travis.yml index e759a642a0c..75eaac8eab5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,4 @@ -sudo: false -dist: trusty +dist: xenial language: cpp os: - linux @@ -9,7 +8,7 @@ compiler: - gcc osx_image: xcode8.3 jdk: - - oraclejdk7 + - openjdk7 cache: - ccache - apt @@ -71,7 +70,10 @@ install: CC=gcc-8 && CXX=g++-8; fi - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then - mkdir cmake-dist && curl -sfSL https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + mkdir cmake-dist && curl -sfSL https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + fi + - if [[ "${JOB_NAME}" == java_test ]]; then + java -version && echo "JAVA_HOME=${JAVA_HOME}"; fi before_script: @@ -101,7 +103,7 @@ script: esac - case $JOB_NAME in java_test) - OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest + OPT=-DTRAVIS V=1 make rocksdbjava jtest ;; lite_build) OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools @@ -110,6 +112,7 @@ script: OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 ;; cmake-mingw) + sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; mkdir build && cd build && cmake -DJNI=1 .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni ;; cmake*) diff --git a/CMakeLists.txt b/CMakeLists.txt index 006f6798666..eda1281e149 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,6 +183,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") + add_definitions(-D_POSIX_C_SOURCE=1) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 360951834a7..f00b6f7f919 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.4) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4") + message("Please consider switching to CMake 3.11.4 or newer") +endif() + set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc @@ -11,9 +15,9 @@ set(JNI_NATIVE_SOURCES rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc rocksjni/compaction_filter_factory_jnicallback.cc - rocksjni/compaction_job_info.cc - rocksjni/compaction_job_stats.cc - rocksjni/compaction_options.cc + rocksjni/compaction_job_info.cc + rocksjni/compaction_job_stats.cc + rocksjni/compaction_options.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc rocksjni/compact_range_options.cc @@ -72,125 +76,9 @@ set(JNI_NATIVE_SOURCES rocksjni/write_buffer_manager.cc ) -set(NATIVE_JAVA_CLASSES - org.rocksdb.AbstractCompactionFilter - org.rocksdb.AbstractCompactionFilterFactory - org.rocksdb.AbstractComparator - org.rocksdb.AbstractImmutableNativeReference - org.rocksdb.AbstractNativeReference - org.rocksdb.AbstractRocksIterator - org.rocksdb.AbstractSlice - org.rocksdb.AbstractTableFilter - org.rocksdb.AbstractTraceWriter - org.rocksdb.AbstractTransactionNotifier - org.rocksdb.AbstractWalFilter - org.rocksdb.BackupableDBOptions - org.rocksdb.BackupEngine - org.rocksdb.BlockBasedTableConfig - org.rocksdb.BloomFilter - org.rocksdb.CassandraCompactionFilter - org.rocksdb.CassandraValueMergeOperator - org.rocksdb.Checkpoint - org.rocksdb.ClockCache - org.rocksdb.ColumnFamilyHandle - org.rocksdb.ColumnFamilyOptions - org.rocksdb.CompactionJobInfo - org.rocksdb.CompactionJobStats - org.rocksdb.CompactionOptions - org.rocksdb.CompactionOptionsFIFO - org.rocksdb.CompactionOptionsUniversal - org.rocksdb.CompactRangeOptions - org.rocksdb.Comparator - org.rocksdb.ComparatorOptions - org.rocksdb.CompressionOptions - org.rocksdb.DBOptions - org.rocksdb.DirectComparator - org.rocksdb.DirectSlice - org.rocksdb.Env - org.rocksdb.EnvOptions - org.rocksdb.Filter - org.rocksdb.FlushOptions - org.rocksdb.HashLinkedListMemTableConfig - org.rocksdb.HashSkipListMemTableConfig - org.rocksdb.HdfsEnv - org.rocksdb.IngestExternalFileOptions - org.rocksdb.Logger - org.rocksdb.LRUCache - org.rocksdb.MemoryUtil - org.rocksdb.MemTableConfig - org.rocksdb.NativeComparatorWrapper - org.rocksdb.NativeLibraryLoader - org.rocksdb.OptimisticTransactionDB - org.rocksdb.OptimisticTransactionOptions - org.rocksdb.Options - org.rocksdb.OptionsUtil - org.rocksdb.PersistentCache - org.rocksdb.PlainTableConfig - org.rocksdb.RateLimiter - org.rocksdb.ReadOptions - org.rocksdb.RemoveEmptyValueCompactionFilter - org.rocksdb.RestoreOptions - org.rocksdb.RocksCallbackObject 
- org.rocksdb.RocksDB - org.rocksdb.RocksEnv - org.rocksdb.RocksIterator - org.rocksdb.RocksIteratorInterface - org.rocksdb.RocksMemEnv - org.rocksdb.RocksMutableObject - org.rocksdb.RocksObject - org.rocksdb.SkipListMemTableConfig - org.rocksdb.Slice - org.rocksdb.Snapshot - org.rocksdb.SstFileManager - org.rocksdb.SstFileWriter - org.rocksdb.Statistics - org.rocksdb.StringAppendOperator - org.rocksdb.TableFormatConfig - org.rocksdb.ThreadStatus - org.rocksdb.TimedEnv - org.rocksdb.Transaction - org.rocksdb.TransactionDB - org.rocksdb.TransactionDBOptions - org.rocksdb.TransactionLogIterator - org.rocksdb.TransactionOptions - org.rocksdb.TtlDB - org.rocksdb.UInt64AddOperator - org.rocksdb.VectorMemTableConfig - org.rocksdb.WBWIRocksIterator - org.rocksdb.WriteBatch - org.rocksdb.WriteBatch.Handler - org.rocksdb.WriteBatchInterface - org.rocksdb.WriteBatchWithIndex - org.rocksdb.WriteOptions - org.rocksdb.NativeComparatorWrapperTest - org.rocksdb.RocksDBExceptionTest - org.rocksdb.SnapshotTest - org.rocksdb.WriteBatchTest - org.rocksdb.WriteBatchTestInternalHelper - org.rocksdb.WriteBufferManager -) - -include(FindJava) -include(UseJava) -include(FindJNI) - -include_directories(${JNI_INCLUDE_DIRS}) -include_directories(${PROJECT_SOURCE_DIR}/java) - -set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) -set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) -set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) -set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) -set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) -set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) -set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) -set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) - -add_jar( - rocksdbjni_classes - SOURCES - src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java +set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractCompactionFilter.java + src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java @@ -338,8 +226,8 @@ add_jar( src/main/java/org/rocksdb/WalProcessingOption.java src/main/java/org/rocksdb/WALRecoveryMode.java src/main/java/org/rocksdb/WBWIRocksIterator.java - src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatch.java + src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java @@ -348,6 +236,10 @@ add_jar( src/main/java/org/rocksdb/util/Environment.java src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java src/main/java/org/rocksdb/util/SizeUnit.java + src/main/java/org/rocksdb/UInt64AddOperator.java +) + +set(JAVA_TEST_CLASSES src/test/java/org/rocksdb/BackupEngineTest.java src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -355,13 +247,59 @@ add_jar( src/test/java/org/rocksdb/RocksDBExceptionTest.java src/test/java/org/rocksdb/RocksMemoryResource.java src/test/java/org/rocksdb/SnapshotTest.java - src/main/java/org/rocksdb/UInt64AddOperator.java src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java - INCLUDE_JARS ${JAVA_TESTCLASSPATH} ) 
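Both of the header-generation paths set up below produce the same artifact: C headers under java/include/ that declare the JNI entry points implemented by the rocksjni/*.cc sources listed at the top of this file. As a minimal sketch of the shape of one such declaration — the class, method, and mangled name are illustrative assumptions, not copied from a generated header:

    #include <jni.h>

    #ifdef __cplusplus
    extern "C" {
    #endif
    /*
     * Class:     org_rocksdb_Options
     * Method:    newOptions
     * Signature: ()J
     *
     * Hypothetical example of a generated declaration: javah (Java 7) and
     * `javac -h` / GENERATE_NATIVE_HEADERS (Java 8+) both emit this form.
     */
    JNIEXPORT jlong JNICALL Java_org_rocksdb_Options_newOptions__(JNIEnv*, jclass);
    #ifdef __cplusplus
    }
    #endif

Only the generation mechanism changes with the JDK (javah was removed in Java 10, hence the version checks below); the C++ implementations are unaffected.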
+include(FindJava) +include(UseJava) +find_package(JNI) + +include_directories(${JNI_INCLUDE_DIRS}) +include_directories(${PROJECT_SOURCE_DIR}/java) + +set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) +set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) + +set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) + +if(${Java_VERSION_MAJOR} VERSION_GREATER_EQUAL "10" AND ${CMAKE_VERSION} VERSION_LESS "3.11.4") + # Java 10 and newer don't have javah, but the alternative GENERATE_NATIVE_HEADERS requires CMake 3.11.4 or newer + message(FATAL_ERROR "Detected Java 10 or newer (${Java_VERSION_STRING}), to build with CMake please upgrade CMake to 3.11.4 or newer") + +elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 prepare the JAR... + message("Preparing Jar for Java 7") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + ) + +else () + # Java 1.8 or newer prepare the JAR... + message("Preparing Jar for JDK ${Java_VERSION_STRING}") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR} + ) + +endif() + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes) endif() @@ -424,15 +362,114 @@ if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() -set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 ONLY generate JNI headers, Java 1.8+ JNI is handled in add_jar step above + message("Preparing JNI headers for Java 7") + set(NATIVE_JAVA_CLASSES + org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractCompactionFilterFactory + org.rocksdb.AbstractComparator + org.rocksdb.AbstractImmutableNativeReference + org.rocksdb.AbstractNativeReference + org.rocksdb.AbstractRocksIterator + org.rocksdb.AbstractSlice + org.rocksdb.AbstractTableFilter + org.rocksdb.AbstractTraceWriter + org.rocksdb.AbstractTransactionNotifier + org.rocksdb.AbstractWalFilter + org.rocksdb.BackupableDBOptions + org.rocksdb.BackupEngine + org.rocksdb.BlockBasedTableConfig + org.rocksdb.BloomFilter + org.rocksdb.CassandraCompactionFilter + org.rocksdb.CassandraValueMergeOperator + org.rocksdb.Checkpoint + org.rocksdb.ClockCache + org.rocksdb.ColumnFamilyHandle + org.rocksdb.ColumnFamilyOptions + org.rocksdb.CompactionJobInfo + org.rocksdb.CompactionJobStats + org.rocksdb.CompactionOptions + org.rocksdb.CompactionOptionsFIFO + org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions + org.rocksdb.Comparator + org.rocksdb.ComparatorOptions + org.rocksdb.CompressionOptions + org.rocksdb.DBOptions + org.rocksdb.DirectComparator + org.rocksdb.DirectSlice + org.rocksdb.Env + org.rocksdb.EnvOptions + 
org.rocksdb.Filter + org.rocksdb.FlushOptions + org.rocksdb.HashLinkedListMemTableConfig + org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.HdfsEnv + org.rocksdb.IngestExternalFileOptions + org.rocksdb.Logger + org.rocksdb.LRUCache + org.rocksdb.MemoryUtil + org.rocksdb.MemTableConfig + org.rocksdb.NativeComparatorWrapper + org.rocksdb.NativeLibraryLoader + org.rocksdb.OptimisticTransactionDB + org.rocksdb.OptimisticTransactionOptions + org.rocksdb.Options + org.rocksdb.OptionsUtil + org.rocksdb.PersistentCache + org.rocksdb.PlainTableConfig + org.rocksdb.RateLimiter + org.rocksdb.ReadOptions + org.rocksdb.RemoveEmptyValueCompactionFilter + org.rocksdb.RestoreOptions + org.rocksdb.RocksCallbackObject + org.rocksdb.RocksDB + org.rocksdb.RocksEnv + org.rocksdb.RocksIterator + org.rocksdb.RocksIteratorInterface + org.rocksdb.RocksMemEnv + org.rocksdb.RocksMutableObject + org.rocksdb.RocksObject + org.rocksdb.SkipListMemTableConfig + org.rocksdb.Slice + org.rocksdb.Snapshot + org.rocksdb.SstFileManager + org.rocksdb.SstFileWriter + org.rocksdb.Statistics + org.rocksdb.StringAppendOperator + org.rocksdb.TableFormatConfig + org.rocksdb.ThreadStatus + org.rocksdb.TimedEnv + org.rocksdb.Transaction + org.rocksdb.TransactionDB + org.rocksdb.TransactionDBOptions + org.rocksdb.TransactionLogIterator + org.rocksdb.TransactionOptions + org.rocksdb.TtlDB + org.rocksdb.UInt64AddOperator + org.rocksdb.VectorMemTableConfig + org.rocksdb.WBWIRocksIterator + org.rocksdb.WriteBatch + org.rocksdb.WriteBatch.Handler + org.rocksdb.WriteBatchInterface + org.rocksdb.WriteBatchWithIndex + org.rocksdb.WriteOptions + org.rocksdb.NativeComparatorWrapperTest + org.rocksdb.RocksDBExceptionTest + org.rocksdb.SnapshotTest + org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBufferManager + ) -file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) -create_javah( - TARGET rocksdbjni_headers - CLASSES ${NATIVE_JAVA_CLASSES} - CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} - OUTPUT_DIR ${JNI_OUTPUT_DIR} -) + create_javah( + TARGET rocksdbjni_headers + CLASSES ${NATIVE_JAVA_CLASSES} + CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} + OUTPUT_DIR ${JNI_OUTPUT_DIR} + ) +endif() if(NOT MSVC) set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/java/Makefile b/java/Makefile index efc9d2b4e11..7aa15bfd038 100644 --- a/java/Makefile +++ b/java/Makefile @@ -229,12 +229,20 @@ javalib: java java_test javadocs java: $(AM_V_GEN)mkdir -p $(MAIN_CLASSES) +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javac $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java +else + $(AM_V_at)javac $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ + $(MAIN_SRC)/org/rocksdb/util/*.java\ + $(MAIN_SRC)/org/rocksdb/*.java +endif $(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md $(AM_V_at)@rm -f ./HISTORY-CPP.md +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) +endif sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) @@ -276,11 +284,18 @@ resolve_test_deps: java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) +ifeq ($(shell java -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ 
$(TEST_SRC)/org/rocksdb/*.java $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) +else + $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ + $(TEST_SRC)/org/rocksdb/test/*.java\ + $(TEST_SRC)/org/rocksdb/util/*.java\ + $(TEST_SRC)/org/rocksdb/*.java +endif test: java java_test run_test From d43b4cd570dccf234d2a43f6acec2d5160971cc3 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 17 Jun 2019 11:03:47 -0700 Subject: [PATCH 149/572] Integrate block cache tracing into db_bench (#5459) Summary: This PR integrates the block cache tracing into db_bench. It adds three command line arguments. -block_cache_trace_file (Block cache trace file path.) type: string default: "" -block_cache_trace_max_trace_file_size_in_bytes (The maximum block cache trace file size in bytes. Block cache accesses will not be logged if the trace file size exceeds this threshold. Default is 64 GB.) type: int64 default: 68719476736 -block_cache_trace_sampling_frequency (Block cache trace sampling frequency, termed s. It uses spatial downsampling and samples accesses to one out of s blocks.) type: int32 default: 1 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5459 Differential Revision: D15832031 Pulled By: HaoyuHuang fbshipit-source-id: 0ecf2f2686557251fe741a2769b21170777efa3d --- tools/db_bench_tool.cc | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index b254978c5ed..a14758418c3 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -774,6 +774,17 @@ DEFINE_string(trace_file, "", "Trace workload to a file. "); DEFINE_int32(trace_replay_fast_forward, 1, "Fast forward trace replay, must >= 1. "); +DEFINE_int32(block_cache_trace_sampling_frequency, 1, + "Block cache trace sampling frequency, termed s. It uses spatial " + "downsampling and samples accesses to one out of s blocks."); +DEFINE_int64( + block_cache_trace_max_trace_file_size_in_bytes, + uint64_t{64} * 1024 * 1024 * 1024, + "The maximum block cache trace file size in bytes. Block cache accesses " + "will not be logged if the trace file size exceeds this threshold. Default " + "is 64 GB."); +DEFINE_string(block_cache_trace_file, "", "Block cache trace file path."); + static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -2081,6 +2092,7 @@ class Benchmark { Options open_options_; // keep options around to properly destroy db later #ifndef ROCKSDB_LITE TraceOptions trace_options_; + TraceOptions block_cache_trace_options_; #endif int64_t reads_; int64_t deletes_; @@ -2917,6 +2929,47 @@ class Benchmark { fprintf(stdout, "Tracing the workload to: [%s]\n", FLAGS_trace_file.c_str()); } + // Start block cache tracing. + if (!FLAGS_block_cache_trace_file.empty()) { + // Sanity checks. 
+      if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
+        fprintf(stderr,
+                "Block cache trace sampling frequency must be higher than "
+                "0.\n");
+        exit(1);
+      }
+      if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
+        fprintf(stderr,
+                "The maximum file size for block cache tracing must be "
+                "higher than 0.\n");
+        exit(1);
+      }
+      block_cache_trace_options_.max_trace_file_size =
+          FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
+      block_cache_trace_options_.sampling_frequency =
+          FLAGS_block_cache_trace_sampling_frequency;
+      std::unique_ptr<TraceWriter> block_cache_trace_writer;
+      Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
+                                    FLAGS_block_cache_trace_file,
+                                    &block_cache_trace_writer);
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Encountered an error when creating trace writer, %s\n",
+                s.ToString().c_str());
+        exit(1);
+      }
+      s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
+                                       std::move(block_cache_trace_writer));
+      if (!s.ok()) {
+        fprintf(
+            stderr,
+            "Encountered an error when starting block cache tracing, %s\n",
+            s.ToString().c_str());
+        exit(1);
+      }
+      fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
+              FLAGS_block_cache_trace_file.c_str());
+    }
 #endif  // ROCKSDB_LITE

     if (num_warmup > 0) {
@@ -2959,6 +3012,14 @@ class Benchmark {
               s.ToString().c_str());
       }
     }
+    if (!FLAGS_block_cache_trace_file.empty()) {
+      Status s = db_.db->EndBlockCacheTrace();
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Encountered an error ending the block cache tracing, %s\n",
+                s.ToString().c_str());
+      }
+    }
 #endif  // ROCKSDB_LITE

     if (FLAGS_statistics) {

From ee294c24ed26a7efb6688ed165328b7da68aee0d Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 17 Jun 2019 11:07:27 -0700
Subject: [PATCH 150/572] Make db_bloom_filter_test parallel (#5467)

Summary:
When run under TSAN it sometimes takes over 10 minutes and times out. The
slowest ones are `DBBloomFilterTestWithParam.BloomFilter`, of which we have
6. Making the tests run in parallel should take care of the timeout issue.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5467

Differential Revision: D15856912

Pulled By: maysamyabandeh

fbshipit-source-id: 26c43c55312974c1b809c070342dee037d0219f4
---
 TARGETS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TARGETS b/TARGETS
index 7a8bb000596..45a99a55d77 100644
--- a/TARGETS
+++ b/TARGETS
@@ -539,7 +539,7 @@ ROCKS_TESTS = [
     [
         "db_bloom_filter_test",
         "db/db_bloom_filter_test.cc",
-        "serial",
+        "parallel",
     ],
     [
         "db_compaction_filter_test",

From 671d15cbdd3839acb54cb21a2aa82efca4917155 Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Mon, 17 Jun 2019 15:17:43 -0700
Subject: [PATCH 151/572] Persistent Stats: persist stats history to disk
 (#5046)

Summary:
This PR continues the work in https://github.com/facebook/rocksdb/pull/4748 and https://github.com/facebook/rocksdb/pull/4535 by adding a new DBOption `persist_stats_to_disk` which instructs RocksDB to persist stats history to RocksDB itself. When statistics is enabled, and both options `stats_persist_period_sec` and `persist_stats_to_disk` are set, RocksDB will periodically write stats to a built-in column family in the following form: key -> (timestamp in microseconds)#(stats name), value -> stats value. The existing API `GetStatsHistory` will detect the current value of `persist_stats_to_disk` and either read from the in-memory data structure or from the hidden column family on disk.
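For a concrete picture of the read path, here is a minimal sketch of consuming the history through the public API, assuming a db opened with statistics enabled, `stats_persist_period_sec` set, and `persist_stats_to_disk = true` (error handling elided):

    #include <cinttypes>
    #include <cstdio>
    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/stats_history.h"

    // Print every persisted stats slice whose timestamp falls in
    // [start_time, end_time).
    void DumpStatsHistory(rocksdb::DB* db, uint64_t start_time,
                          uint64_t end_time) {
      std::unique_ptr<rocksdb::StatsHistoryIterator> it;
      rocksdb::Status s = db->GetStatsHistory(start_time, end_time, &it);
      if (!s.ok() || it == nullptr) {
        return;
      }
      for (; it->Valid(); it->Next()) {
        uint64_t slice_time = it->GetStatsTime();
        for (const auto& stat : it->GetStatsMap()) {
          // Each value is the per-interval delta of one ticker.
          std::fprintf(stdout, "%" PRIu64 " %s = %" PRIu64 "\n", slice_time,
                       stat.first.c_str(), stat.second);
        }
      }
    }

Whether the iterator is backed by the in-memory map or by the hidden column family is decided inside `GetStatsHistory`, so callers do not change when `persist_stats_to_disk` is toggled.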
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5046 Differential Revision: D15863138 Pulled By: miasantreble fbshipit-source-id: bb82abdb3f2ca581aa42531734ac799f113e931b --- CMakeLists.txt | 4 +- Makefile | 4 + TARGETS | 8 +- db/db_impl/db_impl.cc | 91 ++- db/db_impl/db_impl.h | 26 +- db/db_impl/db_impl_debug.cc | 4 +- db/db_impl/db_impl_open.cc | 117 +++- db/db_options_test.cc | 265 -------- db/version_set.cc | 19 +- include/rocksdb/db.h | 3 +- include/rocksdb/options.h | 12 + include/rocksdb/stats_history.h | 4 +- {db => monitoring}/in_memory_stats_history.cc | 2 +- {db => monitoring}/in_memory_stats_history.h | 2 +- monitoring/persistent_stats_history.cc | 171 ++++++ monitoring/persistent_stats_history.h | 83 +++ monitoring/stats_history_test.cc | 576 ++++++++++++++++++ options/db_options.cc | 5 +- options/db_options.h | 1 + options/options.cc | 1 - options/options_helper.cc | 5 + options/options_settable_test.cc | 1 + options/options_test.cc | 2 + src.mk | 66 +- tools/db_bench_tool.cc | 3 + 25 files changed, 1143 insertions(+), 332 deletions(-) rename {db => monitoring}/in_memory_stats_history.cc (97%) rename {db => monitoring}/in_memory_stats_history.h (98%) create mode 100644 monitoring/persistent_stats_history.cc create mode 100644 monitoring/persistent_stats_history.h create mode 100644 monitoring/stats_history_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index eda1281e149..7ff61dca99f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -521,7 +521,6 @@ set(SOURCES db/flush_scheduler.cc db/forward_iterator.cc db/internal_stats.cc - db/in_memory_stats_history.cc db/logs_with_prep_tracker.cc db/log_reader.cc db/log_writer.cc @@ -568,10 +567,12 @@ set(SOURCES memtable/write_buffer_manager.cc monitoring/histogram.cc monitoring/histogram_windowing.cc + monitoring/in_memory_stats_history.cc monitoring/instrumented_mutex.cc monitoring/iostats_context.cc monitoring/perf_context.cc monitoring/perf_level.cc + monitoring/persistent_stats_history.cc monitoring/statistics.cc monitoring/thread_status_impl.cc monitoring/thread_status_updater.cc @@ -955,6 +956,7 @@ if(WITH_TESTS) monitoring/histogram_test.cc monitoring/iostats_context_test.cc monitoring/statistics_test.cc + monitoring/stats_history_test.cc options/options_settable_test.cc options/options_test.cc table/block_based/block_based_filter_block_test.cc diff --git a/Makefile b/Makefile index 5944325aafe..a499cbbedd7 100644 --- a/Makefile +++ b/Makefile @@ -548,6 +548,7 @@ TESTS = \ ldb_cmd_test \ persistent_cache_test \ statistics_test \ + stats_history_test \ lru_cache_test \ object_registry_test \ repair_test \ @@ -1566,6 +1567,9 @@ persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +stats_history_test: monitoring/stats_history_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 45a99a55d77..a43ed6b1085 100644 --- a/TARGETS +++ b/TARGETS @@ -113,7 +113,6 @@ cpp_library( "db/flush_job.cc", "db/flush_scheduler.cc", "db/forward_iterator.cc", - "db/in_memory_stats_history.cc", "db/internal_stats.cc", "db/log_reader.cc", "db/log_writer.cc", @@ -163,10 +162,12 @@ cpp_library( "memtable/write_buffer_manager.cc", "monitoring/histogram.cc", "monitoring/histogram_windowing.cc", + "monitoring/in_memory_stats_history.cc", "monitoring/instrumented_mutex.cc", 
"monitoring/iostats_context.cc", "monitoring/perf_context.cc", "monitoring/perf_level.cc", + "monitoring/persistent_stats_history.cc", "monitoring/statistics.cc", "monitoring/thread_status_impl.cc", "monitoring/thread_status_updater.cc", @@ -971,6 +972,11 @@ ROCKS_TESTS = [ "monitoring/statistics_test.cc", "serial", ], + [ + "stats_history_test", + "monitoring/stats_history_test.cc", + "serial", + ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 154e6dd2339..21b8f3d9165 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -34,7 +34,6 @@ #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/forward_iterator.h" -#include "db/in_memory_stats_history.h" #include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -58,8 +57,10 @@ #include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" +#include "monitoring/in_memory_stats_history.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "options/cf_options.h" @@ -98,6 +99,9 @@ namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); +const int kMicrosInSecond = 1000 * 1000; void DumpRocksDBBuildVersion(Logger* log); CompressionType GetCompressionFlush( @@ -162,6 +166,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, logfile_number_(0), log_dir_synced_(false), log_empty_(true), + persist_stats_cf_handle_(nullptr), log_sync_cv_(&mutex_), total_log_size_(0), is_snapshot_supported_(true), @@ -482,10 +487,17 @@ Status DBImpl::CloseHelper() { } } - if (default_cf_handle_ != nullptr) { + if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); - delete default_cf_handle_; + if (default_cf_handle_) { + delete default_cf_handle_; + default_cf_handle_ = nullptr; + } + if (persist_stats_cf_handle_) { + delete persist_stats_cf_handle_; + persist_stats_cf_handle_ = nullptr; + } mutex_.Lock(); } @@ -634,7 +646,7 @@ void DBImpl::StartTimedTasks() { if (!thread_dump_stats_) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - stats_dump_period_sec * 1000000)); + static_cast(stats_dump_period_sec) * kMicrosInSecond)); } } stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; @@ -642,14 +654,14 @@ void DBImpl::StartTimedTasks() { if (!thread_persist_stats_) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - stats_persist_period_sec * 1000000)); + static_cast(stats_persist_period_sec) * kMicrosInSecond)); } } } } // esitmate the total size of stats_history_ -size_t DBImpl::EstiamteStatsHistorySize() const { +size_t DBImpl::EstimateInMemoryStatsHistorySize() const { size_t size_total = sizeof(std::map>); if (stats_history_.size() == 0) return size_total; @@ -671,7 +683,7 @@ void DBImpl::PersistStats() { if (shutdown_initiated_) { return; } - uint64_t now_micros = env_->NowMicros(); + uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; Statistics* statistics = 
immutable_db_options_.statistics.get(); if (!statistics) { return; @@ -682,12 +694,40 @@ void DBImpl::PersistStats() { stats_history_size_limit = mutable_db_options_.stats_history_buffer_size; } - // TODO(Zhongyi): also persist immutable_db_options_.statistics - { - std::map stats_map; - if (!statistics->getTickerMap(&stats_map)) { - return; + std::map stats_map; + if (!statistics->getTickerMap(&stats_map)) { + return; + } + + if (immutable_db_options_.persist_stats_to_disk) { + WriteBatch batch; + if (stats_slice_initialized_) { + for (const auto& stat : stats_map) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + uint64_t delta = stat.second - stats_slice_[stat.first]; + batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), + ToString(delta)); + } + } } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + Status s = Write(wo, &batch); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing to persistent stats CF failed -- %s\n", + s.ToString().c_str()); + } + // TODO(Zhongyi): add purging for persisted data + } else { InstrumentedMutexLock l(&stats_history_mutex_); // calculate the delta from last time if (stats_slice_initialized_) { @@ -697,17 +737,19 @@ void DBImpl::PersistStats() { stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } - stats_history_[now_micros] = stats_delta; + stats_history_[now_seconds] = stats_delta; } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); // delete older stats snapshots to control memory consumption - bool purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + bool purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; while (purge_needed && !stats_history_.empty()) { stats_history_.erase(stats_history_.begin()); - purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; } } // TODO: persist stats to disk @@ -741,8 +783,13 @@ Status DBImpl::GetStatsHistory( if (!stats_iterator) { return Status::InvalidArgument("stats_iterator not preallocated."); } - stats_iterator->reset( - new InMemoryStatsHistoryIterator(start_time, end_time, this)); + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } return (*stats_iterator)->status(); } @@ -946,7 +993,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_dump_period_sec > 0) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - new_options.stats_dump_period_sec * 1000000)); + static_cast(new_options.stats_dump_period_sec) * + kMicrosInSecond)); } else { thread_dump_stats_.reset(); } @@ -961,7 +1009,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_persist_period_sec > 0) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - new_options.stats_persist_period_sec * 1000000)); + static_cast(new_options.stats_persist_period_sec) * + kMicrosInSecond)); } else { 
thread_persist_stats_.reset(); } @@ -1373,6 +1422,10 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { return default_cf_handle_; } +ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { + return persist_stats_cf_handle_; +} + Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 942c36ff6e6..e6d5a56e244 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -66,6 +66,7 @@ class Arena; class ArenaWrappedDBIter; class InMemoryStatsHistoryIterator; class MemTable; +class PersistentStatsHistoryIterator; class TableCache; class TaskLimiterToken; class Version; @@ -268,6 +269,8 @@ class DBImpl : public DB { ColumnFamilyHandle* DefaultColumnFamily() const override; + ColumnFamilyHandle* PersistentStatsColumnFamily() const; + virtual Status Close() override; Status GetStatsHistory( @@ -822,7 +825,7 @@ class DBImpl : public DB { void TEST_WaitForDumpStatsRun(std::function callback) const; void TEST_WaitForPersistStatsRun(std::function callback) const; bool TEST_IsPersistentStatsEnabled() const; - size_t TEST_EstiamteStatsHistorySize() const; + size_t TEST_EstimateInMemoryStatsHistorySize() const; #endif // NDEBUG @@ -1016,6 +1019,7 @@ class DBImpl : public DB { friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; @@ -1176,6 +1180,21 @@ class DBImpl : public DB { PrepickedCompaction* prepicked_compaction; }; + // Initialize the built-in column family for persistent stats. Depending on + // whether on-disk persistent stats have been enabled before, it may either + // create both a new column family and a column family handle, or just a + // column family handle. + // Required: DB mutex held + Status InitPersistStatsColumnFamily(); + + // The persistent stats column family has two format version keys which are + // used for compatibility checks. Write the format version if the CF is + // created for the first time; read the format version and check + // compatibility if recovering from disk. This function requires DB mutex + // held at entrance but may release and re-acquire DB mutex in the process. + // Required: DB mutex held + Status PersistentStatsProcessFormatVersion(); + Status ResumeImpl(); void MaybeIgnoreError(Status* s) const; @@ -1424,7 +1443,7 @@ class DBImpl : public DB { void PrintStatistics(); - size_t EstiamteStatsHistorySize() const; + size_t EstimateInMemoryStatsHistorySize() const; // persist stats to column family "___rocksdb_stats_history___" void PersistStats(); @@ -1571,6 +1590,9 @@ class DBImpl : public DB { // expensive mutex_ lock during WAL write, which updates log_empty_. bool log_empty_; + ColumnFamilyHandleImpl* persist_stats_cf_handle_; + + bool persistent_stats_cfd_exists_ = true; // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_.
However since back() is never popped, and push_back() diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 4b558facb37..ec1e1b47752 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -262,8 +262,8 @@ bool DBImpl::TEST_IsPersistentStatsEnabled() const { return thread_persist_stats_ && thread_persist_stats_->IsRunning(); } -size_t DBImpl::TEST_EstiamteStatsHistorySize() const { - return EstiamteStatsHistorySize(); +size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { + return EstimateInMemoryStatsHistorySize(); } } // namespace rocksdb #endif // NDEBUG diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index baa4fe707aa..eec7cf16aa7 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -13,6 +13,7 @@ #include "db/builder.h" #include "db/error_handler.h" #include "file/sst_file_manager_impl.h" +#include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based/block_based_table_factory.h" @@ -375,6 +376,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); + if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } @@ -386,6 +388,10 @@ Status DBImpl::Recover( } } } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } // Initial max_total_in_memory_state_ before recovery logs. Log recovery // may check this value to decide whether to flush. @@ -401,6 +407,8 @@ Status DBImpl::Recover( default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + // TODO(Zhongyi): handle single_column_family_mode_ when + // persistent_stats is enabled single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; @@ -496,6 +504,98 @@ Status DBImpl::Recover( return s; } +Status DBImpl::PersistentStatsProcessFormatVersion() { + mutex_.AssertHeld(); + Status s; + // persist version when stats CF doesn't exist + bool should_persist_format_version = !persistent_stats_cfd_exists_; + mutex_.Unlock(); + if (persistent_stats_cfd_exists_) { + // Check persistent stats format version compatibility. Drop and recreate + // persistent stats CF if format version is incompatible + uint64_t format_version_recovered = 0; + Status s_format = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kFormatVersion, &format_version_recovered); + uint64_t compatible_version_recovered = 0; + Status s_compatible = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kCompatibleVersion, + &compatible_version_recovered); + // abort reading from existing stats CF if any of following is true: + // 1. failed to read format version or compatible version from disk + // 2. sst's format version is greater than current format version, meaning + // this sst is encoded with a newer RocksDB release, and current compatible + // version is below the sst's compatible version + if (!s_format.ok() || !s_compatible.ok() || + (kStatsCFCurrentFormatVersion < format_version_recovered && + kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { + if (!s_format.ok() || !s_compatible.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Reading persistent stats version key failed. 
Format key: %s, " + "compatible key: %s", + s_format.ToString().c_str(), s_compatible.ToString().c_str()); + } else { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Disable persistent stats due to corrupted or incompatible format " + "version\n"); + } + DropColumnFamily(persist_stats_cf_handle_); + DestroyColumnFamilyHandle(persist_stats_cf_handle_); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } + } + if (s.ok() && should_persist_format_version) { + // Persistent stats CF being created for the first time, need to write + // format version key + WriteBatch batch; + batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } + mutex_.Lock(); + return s; +} + +Status DBImpl::InitPersistStatsColumnFamily() { + mutex_.AssertHeld(); + assert(!persist_stats_cf_handle_); + ColumnFamilyData* persistent_stats_cfd = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; + + Status s; + if (persistent_stats_cfd != nullptr) { + // We are recovering from a DB which already contains persistent stats CF, + // the CF is already created in VersionSet::ApplyOneVersionEdit, but + // column family handle was not. Need to explicitly create handle here. 
+ persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + // REQUIRES: log_numbers are sorted in ascending order Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only) { @@ -1065,12 +1165,23 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { std::vector column_families; column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } std::vector handles; Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { - assert(handles.size() == 1); + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } // i can delete the handle since DBImpl is always holding a reference to // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } delete handles[0]; } return s; @@ -1247,6 +1358,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->directories_.GetDbDir()->Fsync(); } } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version but no need to fail Open() even if it fails + s = impl->PersistentStatsProcessFormatVersion(); + } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index bf33153284e..7dd672646b5 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -518,114 +518,6 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { Close(); } -TEST_F(DBOptionsTest, RunStatsDumpPeriodSec) { - Options options; - options.create_if_missing = true; - options.stats_dump_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - int counter = 0; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DumpStats:1", [&](void* /*arg*/) { - counter++; - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); - int old_val = counter; - for (int i = 6; i < 20; ++i) { - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); - } - ASSERT_EQ(counter, old_val); - Close(); -} - -// Test persistent stats background thread scheduling 
and cancelling -TEST_F(DBOptionsTest, StatsPersistScheduling) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); - Close(); -} - -// Test enabling persistent stats for the first time -TEST_F(DBOptionsTest, PersistentStatsFreshInstall) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 0; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - Close(); -} - TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { Options options; options.create_if_missing = true; @@ -640,163 +532,6 @@ TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); } -TEST_F(DBOptionsTest, GetStatsHistory) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = rocksdb::CreateDBStatistics(); - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - 
"InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - int mock_time = 1; - // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 6 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - // disabled stats snapshots - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - size_t stats_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - ASSERT_GT(stats_count, 0); - // Wait a bit and verify no more stats are found - for (mock_time = 6; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_new = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - stats_count_new += stats_iter->GetStatsMap().size(); - } - ASSERT_EQ(stats_count_new, stats_count); - Close(); -} - -TEST_F(DBOptionsTest, InMemoryStatsHistoryPurging) { - Options options; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.stats_persist_period_sec = 1; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - // some random operation to populate statistics - ASSERT_OK(Delete("foo")); - ASSERT_OK(Put("sol", "sol")); - ASSERT_OK(Put("epic", "epic")); - ASSERT_OK(Put("ltd", "ltd")); - ASSERT_EQ("sol", Get("sol")); - ASSERT_EQ("epic", Get("epic")); - ASSERT_EQ("ltd", Get("ltd")); - Iterator* iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - ASSERT_OK(Delete("sol")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - int mock_time = 1; - // Wait for stats persist to finish - for (; mock_time < 5; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - - // second round of ops - ASSERT_OK(Put("saigon", "saigon")); - ASSERT_OK(Put("noodle talk", "noodle talk")); - ASSERT_OK(Put("ping bistro", "ping bistro")); - iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); 
iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (; mock_time < 10; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 10 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count = 0; - int slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - size_t stats_history_size = dbfull()->TEST_EstiamteStatsHistorySize(); - ASSERT_GE(slice_count, 9); - ASSERT_GE(stats_history_size, 12000); - // capping memory cost at 12000 bytes since one slice is around 10000~12000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); - ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); - // Wait for stats persist to finish - for (; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_reopen = 0; - slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count_reopen += stats_map.size(); - } - size_t stats_history_size_reopen = dbfull()->TEST_EstiamteStatsHistorySize(); - // only one slice can fit under the new stats_history_buffer_size - ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 12000 && - stats_history_size_reopen > 0); - ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); - Close(); -} - static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { dbfull->TEST_LockMutex(); JobContext job_context(0); diff --git a/db/version_set.cc b/db/version_set.cc index 30fc744c98a..ccedca7940d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,10 +9,10 @@ #include "db/version_set.h" -#include #include #include #include +#include #include #include #include @@ -32,6 +32,7 @@ #include "file/filename.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -3962,11 +3963,23 @@ Status VersionSet::ApplyOneVersionEditToBuilder( edit.column_family_name_); } auto cf_options = name_to_options.find(edit.column_family_name_); - if (cf_options == name_to_options.end()) { + // implicitly add persistent_stats column family without requiring user + // to specify + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options.end() && + !is_persistent_stats_column_family) { column_families_not_found.insert( {edit.column_family_, edit.column_family_name_}); } else { - cfd = CreateColumnFamily(cf_options->second, &edit); + // recover persistent_stats CF from a DB that already contains it + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + cfd = CreateColumnFamily(cfo, &edit); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + } cfd->set_initialized(); builders.insert(std::make_pair( edit.column_family_, 
std::unique_ptr( diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 3a32d6f82bd..0f8573e4319 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -59,6 +59,7 @@ class CompactionJobInfo; #endif extern const std::string kDefaultColumnFamilyName; +extern const std::string kPersistentStatsColumnFamilyName; struct ColumnFamilyDescriptor { std::string name; ColumnFamilyOptions options; @@ -1335,7 +1336,7 @@ class DB { // Given a window [start_time, end_time), setup a StatsHistoryIterator // to access stats history. Note the start_time and end_time are epoch - // time measured in microsecond, and end_time is an exclusive bound. + // time measured in seconds, and end_time is an exclusive bound. virtual Status GetStatsHistory( uint64_t /*start_time*/, uint64_t /*end_time*/, std::unique_ptr* /*stats_iterator*/) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 307582fe678..fe5617fb5c3 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -694,6 +694,18 @@ struct DBOptions { // Default: 600 unsigned int stats_persist_period_sec = 600; + // If true, automatically persist stats to a hidden column family (column + // family name: ___rocksdb_stats_history___) every + // stats_persist_period_sec seconds; otherwise, write to an in-memory + // struct. Users can query through the `GetStatsHistory` API. + // If a user attempts to create a column family with the same name on a DB + // that has previously set persist_stats_to_disk to true, the column family + // creation will fail, but the hidden column family will survive, as well as + // the previously persisted statistics. + // When persisting stats to disk, the stat name will be limited to 100 bytes. + // Default: false + bool persist_stats_to_disk = false; + // if not zero, periodically take stats snapshots and store in memory, the // memory size for stats snapshots is capped at stats_history_buffer_size // Default: 1MB diff --git a/include/rocksdb/stats_history.h b/include/rocksdb/stats_history.h index 1a841908170..c6634ae68aa 100644 --- a/include/rocksdb/stats_history.h +++ b/include/rocksdb/stats_history.h @@ -49,10 +49,12 @@ class StatsHistoryIterator { // REQUIRES: Valid() virtual void Next() = 0; - // Return the time stamp (in microseconds) when stats history is recorded. + // Return the time stamp (in seconds) when stats history is recorded. // REQUIRES: Valid() virtual uint64_t GetStatsTime() const = 0; + virtual int GetFormatVersion() const { return -1; } + // Return the current stats history as an std::map which specifies the // mapping from stats name to stats value. The underlying storage // for the returned map is valid only until the next modification of diff --git a/db/in_memory_stats_history.cc b/monitoring/in_memory_stats_history.cc similarity index 97% rename from db/in_memory_stats_history.cc rename to monitoring/in_memory_stats_history.cc index 41fdb71c8c1..22ecde0ab6c 100644 --- a/db/in_memory_stats_history.cc +++ b/monitoring/in_memory_stats_history.cc @@ -6,7 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors.
-#include "db/in_memory_stats_history.h" +#include "monitoring/in_memory_stats_history.h" #include "db/db_impl/db_impl.h" namespace rocksdb { diff --git a/db/in_memory_stats_history.h b/monitoring/in_memory_stats_history.h similarity index 98% rename from db/in_memory_stats_history.h rename to monitoring/in_memory_stats_history.h index eeb679cc0a2..8ccec146a96 100644 --- a/db/in_memory_stats_history.h +++ b/monitoring/in_memory_stats_history.h @@ -25,7 +25,7 @@ namespace rocksdb { class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { public: // Setup InMemoryStatsHistoryIterator to return stats snapshots between - // microsecond timestamps [start_time, end_time) + // seconds timestamps [start_time, end_time) InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, DBImpl* db_impl) : start_time_(start_time), diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc new file mode 100644 index 00000000000..c1704f56747 --- /dev/null +++ b/monitoring/persistent_stats_history.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "monitoring/persistent_stats_history.h" + +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "port/likely.h" +#include "util/string_util.h" + +namespace rocksdb { +// 10 digit seconds timestamp => [Sep 9, 2001 ~ Nov 20, 2286] +const int kNowSecondsStringLength = 10; +const std::string kFormatVersionKeyString = + "__persistent_stats_format_version__"; +const std::string kCompatibleVersionKeyString = + "__persistent_stats_compatible_version__"; +// Every release maintains two versions numbers for persistents stats: Current +// format version and compatible format version. Current format version +// designates what type of encoding will be used when writing to stats CF; +// compatible format version designates the minimum format version that +// can decode the stats CF encoded using the current format version. 
+const uint64_t kStatsCFCurrentFormatVersion = 1; +const uint64_t kStatsCFCompatibleFormatVersion = 1; + +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number) { + if (type >= StatsVersionKeyType::kKeyTypeMax) { + return Status::InvalidArgument("Invalid stats version key type provided"); + } + std::string key; + if (type == StatsVersionKeyType::kFormatVersion) { + key = kFormatVersionKeyString; + } else if (type == StatsVersionKeyType::kCompatibleVersion) { + key = kCompatibleVersionKeyString; + } + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db->Get(options, db->PersistentStatsColumnFamily(), key, &result); + if (!s.ok() || result.empty()) { + return Status::NotFound("Persistent stats version key " + key + + " not found."); + } + + // read version_number but do nothing in current version + *version_number = ParseUint64(result); + return Status::OK(); +} + +int EncodePersistentStatsKey(uint64_t now_seconds, const std::string& key, + int size, char* buf) { + char timestamp[kNowSecondsStringLength + 1]; + // make time stamp string equal in length to allow sorting by time + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(now_seconds)); + timestamp[kNowSecondsStringLength] = '\0'; + return snprintf(buf, size, "%s#%s", timestamp, key.c_str()); +} + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) { + cfo->write_buffer_size = 2 << 20; + cfo->target_file_size_base = 2 * 1048576; + cfo->max_bytes_for_level_base = 10 * 1048576; + cfo->snap_refresh_nanos = 0; + cfo->soft_pending_compaction_bytes_limit = 256 * 1048576; + cfo->hard_pending_compaction_bytes_limit = 1073741824ul; + cfo->compression = kNoCompression; +} + +PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {} + +bool PersistentStatsHistoryIterator::Valid() const { return valid_; } + +Status PersistentStatsHistoryIterator::status() const { return status_; } + +void PersistentStatsHistoryIterator::Next() { + // increment start_time by 1 to avoid infinite loop + AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); +} + +uint64_t PersistentStatsHistoryIterator::GetStatsTime() const { return time_; } + +const std::map& +PersistentStatsHistoryIterator::GetStatsMap() const { + return stats_map_; +} + +std::pair parseKey(const Slice& key, + uint64_t start_time) { + std::pair result; + std::string key_str = key.ToString(); + std::string::size_type pos = key_str.find("#"); + // TODO(Zhongyi): add counters to track parse failures? 
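+ // Example (illustrative): with start_time = 1000, the key + // "0000001234#rocksdb.block.cache.miss" parses to + // {1234, "rocksdb.block.cache.miss"}, while a key that lacks the '#' + // separator, or whose timestamp is below start_time, yields + // {port::kMaxUint64, ""} so the caller can skip it.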
+ if (pos == std::string::npos) { + result.first = port::kMaxUint64; + result.second.clear(); + } else { + uint64_t parsed_time = ParseUint64(key_str.substr(0, pos)); + // skip entries with timestamp smaller than start_time + if (parsed_time < start_time) { + result.first = port::kMaxUint64; + result.second = ""; + } else { + result.first = parsed_time; + std::string key_resize = key_str.substr(pos + 1); + result.second = key_resize; + } + } + return result; +} + +// advance the iterator to the next time between [start_time, end_time) +// if success, update time_ and stats_map_ with new_time and stats_map +void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, + uint64_t end_time) { + // try to find next entry in stats_history_ map + if (db_impl_ != nullptr) { + ReadOptions ro; + Iterator* iter = + db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily()); + + char timestamp[kNowSecondsStringLength + 1]; + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(std::max(time_, start_time))); + timestamp[kNowSecondsStringLength] = '\0'; + + iter->Seek(timestamp); + // no more entries with timestamp >= start_time is found or version key + // is found to be incompatible + if (!iter->Valid()) { + valid_ = false; + delete iter; + return; + } + time_ = parseKey(iter->key(), start_time).first; + valid_ = true; + // check parsed time and invalid if it exceeds end_time + if (time_ > end_time) { + valid_ = false; + delete iter; + return; + } + // find all entries with timestamp equal to time_ + std::map new_stats_map; + std::pair kv; + for (; iter->Valid(); iter->Next()) { + kv = parseKey(iter->key(), start_time); + if (kv.first != time_) { + break; + } + if (kv.second.compare(kFormatVersionKeyString) == 0) { + continue; + } + new_stats_map[kv.second] = ParseUint64(iter->value().ToString()); + } + stats_map_.swap(new_stats_map); + delete iter; + } else { + valid_ = false; + } +} + +} // namespace rocksdb diff --git a/monitoring/persistent_stats_history.h b/monitoring/persistent_stats_history.h new file mode 100644 index 00000000000..9a6885987fd --- /dev/null +++ b/monitoring/persistent_stats_history.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
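+// For reference, after a couple of persisted slices the stats CF holds rows +// like the following (illustrative sample; a real run stores one row per +// ticker per slice): +// "0000000005#rocksdb.block.cache.miss" -> "0" +// "0000000010#rocksdb.write.wal" -> "2" +// "__persistent_stats_compatible_version__" -> "1" +// "__persistent_stats_format_version__" -> "1"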
+ +#pragma once + +#include "db/db_impl/db_impl.h" +#include "rocksdb/stats_history.h" + +namespace rocksdb { + +extern const std::string kFormatVersionKeyString; +extern const std::string kCompatibleVersionKeyString; +extern const uint64_t kStatsCFCurrentFormatVersion; +extern const uint64_t kStatsCFCompatibleFormatVersion; + +enum StatsVersionKeyType : uint32_t { + kFormatVersion = 1, + kCompatibleVersion = 2, + kKeyTypeMax = 3 +}; + +// Read the version number from the persistent stats cf depending on the type +// provided; stores the version number in `*version_number`. +// Returns Status::OK() on success, or another status code on failure +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number); + +// Encode timestamp and stats key into buf +// Format: timestamp(10 digit) + '#' + key +// Total length of encoded key will be capped at 100 bytes +int EncodePersistentStatsKey(uint64_t timestamp, const std::string& key, + int size, char* buf); + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo); + +class PersistentStatsHistoryIterator final : public StatsHistoryIterator { + public: + PersistentStatsHistoryIterator(uint64_t start_time, uint64_t end_time, + DBImpl* db_impl) + : time_(0), + start_time_(start_time), + end_time_(end_time), + valid_(true), + db_impl_(db_impl) { + AdvanceIteratorByTime(start_time_, end_time_); + } + ~PersistentStatsHistoryIterator() override; + bool Valid() const override; + Status status() const override; + + void Next() override; + uint64_t GetStatsTime() const override; + + const std::map& GetStatsMap() const override; + + private: + // advance the iterator to the next stats history record with timestamp + // between [start_time, end_time) + void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); + + // No copying allowed + PersistentStatsHistoryIterator(const PersistentStatsHistoryIterator&) = + delete; + void operator=(const PersistentStatsHistoryIterator&) = delete; + PersistentStatsHistoryIterator(PersistentStatsHistoryIterator&&) = delete; + PersistentStatsHistoryIterator& operator=(PersistentStatsHistoryIterator&&) = + delete; + + uint64_t time_; + uint64_t start_time_; + uint64_t end_time_; + std::map stats_map_; + Status status_; + bool valid_; + DBImpl* db_impl_; +}; + +} // namespace rocksdb diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc new file mode 100644 index 00000000000..a66043da1fe --- /dev/null +++ b/monitoring/stats_history_test.cc @@ -0,0 +1,576 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
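To make the end-to-end flow concrete before the tests, here is a minimal usage sketch of the new persist_stats_to_disk option together with the existing GetStatsHistory API. This is an illustration under assumptions, not part of the patch: the path /tmp/stats_demo and the 5-second period are arbitrary choices, and error handling is reduced to an early return.

#include <cinttypes>
#include <cstdio>
#include <limits>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"
#include "rocksdb/stats_history.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  options.stats_persist_period_sec = 5;  // take a stats slice every 5 seconds
  options.persist_stats_to_disk = true;  // store slices in the hidden stats CF

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/stats_demo", &db);
  if (!s.ok()) return 1;

  // Read back every persisted slice; both bounds are epoch seconds and the
  // end bound is exclusive.
  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  s = db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &it);
  for (; s.ok() && it != nullptr && it->Valid(); it->Next()) {
    std::printf("slice at %" PRIu64 "\n", it->GetStatsTime());
    for (const auto& stat : it->GetStatsMap()) {
      // stat.first is the ticker name, stat.second the delta for this slice.
      std::printf("  %s = %" PRIu64 "\n", stat.first.c_str(), stat.second);
    }
  }
  delete db;
  return 0;
}

Note that persist_stats_to_disk lives in ImmutableDBOptions, so switching between the in-memory and on-disk histories requires reopening the DB; the tests below therefore exercise it through Reopen/ReopenWithColumnFamilies rather than SetDBOptions.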
+#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/stats_history.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace rocksdb { + +class StatsHistoryTest : public DBTestBase { + public: + StatsHistoryTest() : DBTestBase("/stats_history_test") {} +}; + +TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + int counter = 0; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DumpStats:1", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cancelling the job through SetOptions + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); + int old_val = counter; + for (int i = 6; i < 20; ++i) { + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + } + ASSERT_EQ(counter, old_val); + Close(); +} + +// Test persistent stats background thread scheduling and cancelling +TEST_F(StatsHistoryTest, StatsPersistScheduling) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cancelling the job through SetOptions + ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + Close(); +} + +// Test enabling persistent
stats for the first time +TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 0; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + Close(); +} + +// TODO(Zhongyi): Move persistent stats related tests to a separate file +TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + int mock_time = 1; + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 6 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + // disabled stats snapshots + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + size_t stats_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5); + stats_count += stats_map.size(); + } + ASSERT_GT(stats_count, 0); + // Wait a bit and verify no more stats are found + for (mock_time = 6; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_new = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + stats_count_new += stats_iter->GetStatsMap().size(); + } + ASSERT_EQ(stats_count_new, stats_count); + Close(); +} + +TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) 
{ + Options options; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.stats_persist_period_sec = 1; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // some random operation to populate statistics + ASSERT_OK(Delete("foo")); + ASSERT_OK(Put("sol", "sol")); + ASSERT_OK(Put("epic", "epic")); + ASSERT_OK(Put("ltd", "ltd")); + ASSERT_EQ("sol", Get("sol")); + ASSERT_EQ("epic", Get("epic")); + ASSERT_EQ("ltd", Get("ltd")); + Iterator* iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + ASSERT_OK(Delete("sol")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + int mock_time = 1; + // Wait for stats persist to finish + for (; mock_time < 5; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + + // second round of ops + ASSERT_OK(Put("saigon", "saigon")); + ASSERT_OK(Put("noodle talk", "noodle talk")); + ASSERT_OK(Put("ping bistro", "ping bistro")); + iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + for (; mock_time < 10; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 10 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count += stats_map.size(); + } + size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + ASSERT_GE(slice_count, 9); + ASSERT_GE(stats_history_size, 12000); + // capping memory cost at 12000 bytes since one slice is around 10000~12000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); + ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish + for (; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count_reopen += stats_map.size(); + } + size_t 
stats_history_size_reopen = + dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + // only one slice can fit under the new stats_history_buffer_size + ASSERT_LT(slice_count, 2); + ASSERT_TRUE(stats_history_size_reopen < 12000 && + stats_history_size_reopen > 0); + ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); + Close(); + // TODO: may also want to verify stats timestamp to make sure we are purging + // the correct stats snapshot +} + +int countkeys(Iterator* iter) { + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + return count; +} + +TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count1 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count2 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count3 = countkeys(iter); + delete iter; + ASSERT_GE(key_count2, key_count1); + ASSERT_GE(key_count3, key_count2); + ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + int non_zero_count = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count++; + } + } + stats_count += stats_map.size(); + } + ASSERT_EQ(slice_count, 3); + // 2 extra keys for format version + ASSERT_EQ(stats_count, key_count3 - 2); + // verify reopen will not cause data loss + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + int slice_count_reopen = 0; + int non_zero_count_recover = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count_reopen++; + auto stats_map = stats_iter->GetStatsMap(); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count_recover++; + } + } + stats_count_reopen += stats_map.size(); + } + ASSERT_EQ(non_zero_count, non_zero_count_recover); + ASSERT_EQ(slice_count, slice_count_reopen); + ASSERT_EQ(stats_count, stats_count_reopen); + Close(); +} + +// Test that persisted stats match the values found in options.statistics and +// that the stats values are retained after DB reopen +TEST_F(StatsHistoryTest, PersistentStatsVerifyValue)
{ + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + std::map stats_map_before; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(20); }); + + std::map stats_map_after; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + std::string sample = "rocksdb.num.iterator.deleted"; + uint64_t recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), ++i) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, stats_map_after[sample]); + + // test stats value retains after recovery + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + uint64_t new_recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + new_recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, new_recovered_value); + + // TODO(Zhongyi): also add test to read raw values from disk and verify + // correctness + Close(); +} + +// TODO(Zhongyi): add test for different format versions + +TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + ASSERT_OK(TryReopen(options)); + CreateColumnFamilies({"one", "two", "three"}, options); + ASSERT_OK(Put(1, "foo", "bar")); + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + ASSERT_EQ(Get(2, "foo"), "bar"); + CreateColumnFamilies({"four"}, options); + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + 
ASSERT_EQ(Get(2, "foo"), "bar"); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count = countkeys(iter); + delete iter; + ASSERT_GE(key_count, 0); + uint64_t num_write_wal = 0; + std::string sample = "rocksdb.write.wal"; + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + stats_iter.reset(); + ASSERT_EQ(num_write_wal, 2); + + options.persist_stats_to_disk = false; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + int cf_count = 0; + for (auto cfd : *dbfull()->versions_->GetColumnFamilySet()) { + (void)cfd; + cf_count++; + } + // persistent stats cf will be implicitly opened even if + // persist_stats_to_disk is false + ASSERT_EQ(cf_count, 6); + ASSERT_EQ(Get(2, "foo"), "bar"); + + // attempt to create column family using same name, should fail + ColumnFamilyOptions cf_opts(options); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + + options.persist_stats_to_disk = true; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + // verify stats is not affected by prior failed CF creation + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + num_write_wal = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + ASSERT_EQ(num_write_wal, 2); + + Close(); + Destroy(options); +} + +TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { + ASSERT_OK(Put("bar", "v2")); + Close(); + + auto options = CurrentOptions(); + options.stats_persist_period_sec = 5; + options.persist_stats_to_disk = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v2", Get("bar")); + Close(); + + // Reopen and flush memtable. + Reopen(options); + Flush(); + Close(); + // Now check keys in read only mode. 
+ ASSERT_OK(ReadOnlyReopen(options)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/options/db_options.cc b/options/db_options.cc index bdcdd250a0a..490a3708030 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -84,7 +84,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) two_write_queues(options.two_write_queues), manual_wal_flush(options.manual_wal_flush), atomic_flush(options.atomic_flush), - avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) { + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + persist_stats_to_disk(options.persist_stats_to_disk) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -222,6 +223,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", + persist_stats_to_disk); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 67b26786f5e..92eea4ecfa1 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -81,6 +81,7 @@ struct ImmutableDBOptions { bool manual_wal_flush; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool persist_stats_to_disk; }; struct MutableDBOptions { diff --git a/options/options.cc b/options/options.cc index 1d2b6193cbc..5efd3ce5742 100644 --- a/options/options.cc +++ b/options/options.cc @@ -502,7 +502,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_factory.reset(new BlockBasedTableFactory(table_options)); - return this; } diff --git a/options/options_helper.cc b/options/options_helper.cc index 388256abd9f..71a7f9b2fc0 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -84,6 +84,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec; options.stats_persist_period_sec = mutable_db_options.stats_persist_period_sec; + options.persist_stats_to_disk = immutable_db_options.persist_stats_to_disk; options.stats_history_buffer_size = mutable_db_options.stats_history_buffer_size; options.advise_random_on_open = immutable_db_options.advise_random_on_open; @@ -1580,6 +1581,10 @@ std::unordered_map {offsetof(struct DBOptions, stats_persist_period_sec), OptionType::kUInt, OptionVerificationType::kNormal, true, offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, + {"persist_stats_to_disk", + {offsetof(struct DBOptions, persist_stats_to_disk), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, persist_stats_to_disk)}}, {"stats_history_buffer_size", {offsetof(struct DBOptions, stats_history_buffer_size), OptionType::kSizeT, OptionVerificationType::kNormal, true, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 6044cc4b1c4..f0b79e372f7 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -265,6 +265,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "allow_mmap_writes=false;" "stats_dump_period_sec=70127;" "stats_persist_period_sec=54321;" + "persist_stats_to_disk=true;" "stats_history_buffer_size=14159;" "allow_fallocate=true;" "allow_mmap_reads=false;" diff --git 
a/options/options_test.cc b/options/options_test.cc index 9fcd241d70f..24aeec99e17 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -129,6 +129,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"skip_log_error_on_recovery", "false"}, {"stats_dump_period_sec", "46"}, {"stats_persist_period_sec", "57"}, + {"persist_stats_to_disk", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, @@ -267,6 +268,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); diff --git a/src.mk b/src.mk index 150b1c10af9..e48a6959515 100644 --- a/src.mk +++ b/src.mk @@ -36,7 +36,6 @@ LIB_SOURCES = \ db/flush_job.cc \ db/flush_scheduler.cc \ db/forward_iterator.cc \ - db/in_memory_stats_history.cc \ db/internal_stats.cc \ db/logs_with_prep_tracker.cc \ db/log_reader.cc \ @@ -86,10 +85,12 @@ LIB_SOURCES = \ memtable/write_buffer_manager.cc \ monitoring/histogram.cc \ monitoring/histogram_windowing.cc \ + monitoring/in_memory_stats_history.cc \ monitoring/instrumented_mutex.cc \ monitoring/iostats_context.cc \ monitoring/perf_context.cc \ monitoring/perf_level.cc \ + monitoring/persistent_stats_history.cc \ monitoring/statistics.cc \ monitoring/thread_status_impl.cc \ monitoring/thread_status_updater.cc \ @@ -105,21 +106,21 @@ LIB_SOURCES = \ port/port_posix.cc \ port/stack_trace.cc \ table/adaptive/adaptive_table_factory.cc \ - table/block_based/block.cc \ - table/block_based/block_based_filter_block.cc \ - table/block_based/block_based_table_builder.cc \ - table/block_based/block_based_table_factory.cc \ - table/block_based/block_based_table_reader.cc \ - table/block_based/block_builder.cc \ - table/block_based/block_prefix_index.cc \ - table/block_based/data_block_hash_index.cc \ - table/block_based/data_block_footer.cc \ - table/block_based/flush_block_policy.cc \ - table/block_based/full_filter_block.cc \ - table/block_based/index_builder.cc \ - table/block_based/partitioned_filter_block.cc \ - table/block_fetcher.cc \ - table/bloom_block.cc \ + table/block_based/block.cc \ + table/block_based/block_based_filter_block.cc \ + table/block_based/block_based_table_builder.cc \ + table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_reader.cc \ + table/block_based/block_builder.cc \ + table/block_based/block_prefix_index.cc \ + table/block_based/data_block_hash_index.cc \ + table/block_based/data_block_footer.cc \ + table/block_based/flush_block_policy.cc \ + table/block_based/full_filter_block.cc \ + table/block_based/index_builder.cc \ + table/block_based/partitioned_filter_block.cc \ + table/block_fetcher.cc \ + table/bloom_block.cc \ table/cuckoo/cuckoo_table_builder.cc \ table/cuckoo/cuckoo_table_factory.cc \ table/cuckoo/cuckoo_table_reader.cc \ @@ -233,27 +234,27 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif -TOOL_LIB_SOURCES = \ +TOOL_LIB_SOURCES = \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ tools/sst_dump_tool.cc \ utilities/blob_db/blob_dump_tool.cc \ -ANALYZER_LIB_SOURCES = \ +ANALYZER_LIB_SOURCES = \ tools/block_cache_trace_analyzer.cc \ - tools/trace_analyzer_tool.cc \ + tools/trace_analyzer_tool.cc \ -MOCK_LIB_SOURCES = \ - 
table/mock_table.cc \ +MOCK_LIB_SOURCES = \ + table/mock_table.cc \ test_util/fault_injection_test_env.cc -BENCH_LIB_SOURCES = \ +BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ -TEST_LIB_SOURCES = \ +TEST_LIB_SOURCES = \ db/db_test_util.cc \ - test_util/testharness.cc \ - test_util/testutil.cc \ + test_util/testharness.cc \ + test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ MAIN_SOURCES = \ @@ -301,7 +302,7 @@ MAIN_SOURCES = \ db/dbformat_test.cc \ db/deletefile_test.cc \ db/env_timed_test.cc \ - db/error_handler_test.cc \ + db/error_handler_test.cc \ db/external_sst_file_basic_test.cc \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ @@ -352,12 +353,13 @@ MAIN_SOURCES = \ monitoring/histogram_test.cc \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ + monitoring/stats_history_test.cc \ options/options_test.cc \ - table/block_based/block_based_filter_block_test.cc \ - table/block_based/block_test.cc \ - table/block_based/data_block_hash_index_test.cc \ - table/block_based/full_filter_block_test.cc \ - table/block_based/partitioned_filter_block_test.cc \ + table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_test.cc \ + table/block_based/data_block_hash_index_test.cc \ + table/block_based/full_filter_block_test.cc \ + table/block_based/partitioned_filter_block_test.cc \ table/cleanable_test.cc \ table/cuckoo/cuckoo_table_builder_test.cc \ table/cuckoo/cuckoo_table_reader_test.cc \ @@ -373,7 +375,7 @@ MAIN_SOURCES = \ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ - tools/trace_analyzer_test.cc \ + tools/trace_analyzer_test.cc \ trace_replay/block_cache_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index a14758418c3..9b3e2cac35f 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1146,6 +1146,8 @@ DEFINE_uint64(stats_dump_period_sec, rocksdb::Options().stats_dump_period_sec, DEFINE_uint64(stats_persist_period_sec, rocksdb::Options().stats_persist_period_sec, "Gap between persisting stats in seconds"); +DEFINE_bool(persist_stats_to_disk, rocksdb::Options().persist_stats_to_disk, + "whether to persist stats to disk"); DEFINE_uint64(stats_history_buffer_size, rocksdb::Options().stats_history_buffer_size, "Max number of stats snapshots to keep in memory"); @@ -3727,6 +3729,7 @@ class Benchmark { static_cast(FLAGS_stats_dump_period_sec); options.stats_persist_period_sec = static_cast(FLAGS_stats_persist_period_sec); + options.persist_stats_to_disk = FLAGS_persist_stats_to_disk; options.stats_history_buffer_size = static_cast(FLAGS_stats_history_buffer_size); From 7d8d56413dbc375cb1257306c101f99f2eb75386 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 17 Jun 2019 15:36:20 -0700 Subject: [PATCH 152/572] Override check consistency for DBImplSecondary (#5469) Summary: `DBImplSecondary` calls `CheckConsistency()` during open. In the past, `DBImplSecondary` did not override this function, so `DBImpl::CheckConsistency()` was called. The following can happen: the secondary instance is performing a consistency check, which calls `GetFileSize(file_path)`, but the file at `file_path` has been deleted by the primary instance. `DBImpl::CheckConsistency` does not account for this and fails the consistency check. This is undesirable. The solution is that we call `DBImpl::CheckConsistency()` first. If it passes, then we are good. If not, we give it a second chance and handle the case of file(s) being deleted. 
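As a hedged sketch of the second chance described above (simplified, with a hypothetical helper name; the actual change is the `DBImplSecondary::CheckConsistency()` override below), the relaxation amounts to tolerating `PathNotFound` on files the primary may have deleted:

```c++
#include <cstdint>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/status.h"

// Hypothetical helper for illustration only: a live file that has vanished is
// treated as deleted by the primary rather than as corruption.
rocksdb::Status CheckOneLiveFile(rocksdb::Env* env,
                                 const std::string& file_path) {
  uint64_t file_size = 0;
  rocksdb::Status s = env->GetFileSize(file_path, &file_size);
  if (s.IsPathNotFound()) {
    // Best-effort: the primary removed the file after it was listed as live.
    return rocksdb::Status::OK();
  }
  return s;
}
```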
Test plan (on dev server): ``` $make clean && make -j20 all $./db_secondary_test ``` All other existing unit tests must pass as well. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5469 Differential Revision: D15861845 Pulled By: riversand963 fbshipit-source-id: 507d72392508caed3cd003bb2e2aa43f993dd597 --- db/db_impl/db_impl.cc | 2 ++ db/db_impl/db_impl_secondary.cc | 38 +++++++++++++++++++++++++++++++ db/db_impl/db_impl_secondary.h | 6 +++++ db/db_impl/db_secondary_test.cc | 40 +++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 21b8f3d9165..6341b76854c 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3017,6 +3017,7 @@ Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); std::string corruption_messages; for (const auto& md : metadata) { @@ -3024,6 +3025,7 @@ Status DBImpl::CheckConsistency() { std::string file_path = md.db_path + md.name; uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); Status s = env_->GetFileSize(file_path, &fsize); if (!s.ok() && env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 5cd0beb1f0c..8b93f675f8c 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -451,6 +451,44 @@ Status DBImplSecondary::NewIterators( return Status::OK(); } +Status DBImplSecondary::CheckConsistency() { + mutex_.AssertHeld(); + Status s = DBImpl::CheckConsistency(); + // If DBImpl::CheckConsistency(), which is stricter, returns success, then we + // do not need to give a second chance. + if (s.ok()) { + return s; + } + // It's possible that DBImpl::CheckConsistency() can fail because the primary + // may have removed certain files, causing the GetFileSize(name) call to + // fail and return a PathNotFound. In this case, we take a best-effort + // approach and just proceed. + TEST_SYNC_POINT_CALLBACK( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() || + s.IsPathNotFound())) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } + } + return corruption_messages.empty() ? Status::OK() + : Status::Corruption(corruption_messages); +} + Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); assert(manifest_reader_.get() != nullptr); diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 24cfd33c11d..ca853e25802 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -197,6 +197,12 @@ class DBImplSecondary : public DBImpl { Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); + // Check if all live files exist on the file system and that their file sizes + // match the in-memory records. It is possible that some live files may + // have been deleted by the primary. 
In this case, CheckConsistency() does + // not flag the missing file as an inconsistency. + Status CheckConsistency() override; + protected: // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c9aaa361191..c79589d5022 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -705,6 +705,46 @@ TEST_F(DBSecondaryTest, CatchUpAfterFlush) { iter3->Seek("key1"); ASSERT_FALSE(iter3->Valid()); } + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} #endif //! ROCKSDB_LITE } // namespace rocksdb From 2d1dd5bce7f1c34723e55de57d8f205576cd3e75 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 17 Jun 2019 16:33:40 -0700 Subject: [PATCH 153/572] Support computing miss ratio curves using sim_cache. (#5449) Summary: This PR adds a BlockCacheTraceSimulator that reports the miss ratios given different cache configurations. A cache configuration contains "cache_name,num_shard_bits,cache_capacities". For example, "lru, 1, 1K, 2K, 4M, 4G". When we replay the trace, we also perform lookups and inserts on the simulated caches. In the end, it reports the miss ratio for each tuple in an output file. This PR also adds a main source block_cache_trace_analyzer so that we can run the analyzer from the command line. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5449 Test Plan: Added tests for block_cache_trace_analyzer. COMPILE_WITH_ASAN=1 make check -j32. 
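As a hedged illustration of the configuration format (the struct is defined in the new `tools/block_cache_trace_analyzer.h` below; it is copied here only so the snippet is self-contained), a config line such as `lru,1,1K,2K,4M,4G` populates `CacheConfiguration` roughly like this:

```c++
#include <cstdint>
#include <string>
#include <vector>

// Local copy of the struct from tools/block_cache_trace_analyzer.h, for
// illustration only.
struct CacheConfiguration {
  std::string cache_name;                  // only "lru" is supported
  uint32_t num_shard_bits = 0;
  std::vector<uint64_t> cache_capacities;  // simulated capacities in bytes
};

int main() {
  CacheConfiguration config;
  config.cache_name = "lru";
  config.num_shard_bits = 1;
  // 1K, 2K, 4M, and 4G from the example line above.
  config.cache_capacities = {1024, 2048, 4ULL << 20, 4ULL << 30};
  return 0;
}
```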
Differential Revision: D15797073 Pulled By: HaoyuHuang fbshipit-source-id: aef0c5c2e7938f3e8b6a10d4a6a50e6928ecf408 --- Makefile | 4 + include/rocksdb/utilities/sim_cache.h | 4 + src.mk | 1 + tools/block_cache_trace_analyzer.cc | 254 +++++++++++++++++++++-- tools/block_cache_trace_analyzer.h | 60 +++++- tools/block_cache_trace_analyzer_test.cc | 111 +++++++++- tools/block_cache_trace_analyzer_tool.cc | 25 +++ utilities/simulator_cache/sim_cache.cc | 22 +- 8 files changed, 449 insertions(+), 32 deletions(-) create mode 100644 tools/block_cache_trace_analyzer_tool.cc diff --git a/Makefile b/Makefile index a499cbbedd7..8e8c0ac7638 100644 --- a/Makefile +++ b/Makefile @@ -608,6 +608,7 @@ TOOLS = \ rocksdb_undump \ blob_dump \ trace_analyzer \ + block_cache_trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a @@ -1109,6 +1110,9 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) +block_cache_trace_analyzer: tools/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index bc2a7bc13d9..fef9e9910e8 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -36,6 +36,10 @@ extern std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits); +extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits); + class SimCache : public Cache { public: SimCache() {} diff --git a/src.mk b/src.mk index e48a6959515..71c2bd01803 100644 --- a/src.mk +++ b/src.mk @@ -369,6 +369,7 @@ MAIN_SOURCES = \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ tools/block_cache_trace_analyzer_test.cc \ + tools/block_cache_trace_analyzer_tool.cc \ tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 5d9b2d18409..0ef4b55e46f 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -3,11 +3,44 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#ifndef ROCKSDB_LITE +#ifdef GFLAGS #include "tools/block_cache_trace_analyzer.h" #include +#include +#include +#include #include +#include #include "monitoring/histogram.h" +#include "util/gflags_compat.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_string(block_cache_trace_path, "", "The trace file path."); +DEFINE_string( + block_cache_sim_config_path, "", + "The config file path. One cache configuration per line. The format of a " + "cache configuration is " + "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " + "cache_name is lru. 
cache_capacity can be xK, xM or xG " + "where x is a positive number."); +DEFINE_bool(print_block_size_stats, false, + "Print block size distribution and the distribution break down by " + "block type and column family."); +DEFINE_bool(print_access_count_stats, false, + "Print access count distribution and the distribution break down " + "by block type and column family."); +DEFINE_bool(print_data_block_access_count_stats, false, + "Print data block accesses by user Get and Multi-Get."); +DEFINE_int32(cache_sim_warmup_seconds, 0, + "The number of seconds to warmup simulated caches. The hit/miss " + "counters are reset after the warmup completes."); +DEFINE_string(output_miss_ratio_curve_path, "", + "The output file to save the computed miss ratios. File format: " + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"); namespace rocksdb { namespace { @@ -48,11 +81,101 @@ std::string caller_to_string(BlockCacheLookupCaller caller) { // This cannot happen. return "InvalidCaller"; } + +const char kBreakLine[] = + "***************************************************************\n"; + +void print_break_lines(uint32_t num_break_lines) { + for (uint32_t i = 0; i < num_break_lines; i++) { + fprintf(stdout, kBreakLine); + } +} + } // namespace +BlockCacheTraceSimulator::BlockCacheTraceSimulator( + uint64_t warmup_seconds, + const std::vector& cache_configurations) + : warmup_seconds_(warmup_seconds), + cache_configurations_(cache_configurations) { + for (auto const& config : cache_configurations_) { + for (auto cache_capacity : config.cache_capacities) { + sim_caches_.push_back( + NewSimCache(NewLRUCache(cache_capacity, config.num_shard_bits), + /*real_cache=*/nullptr, config.num_shard_bits)); + } + } +} + +void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { + if (trace_start_time_ == 0) { + trace_start_time_ = access.access_timestamp; + } + // access.access_timestamp is in microseconds. + if (!warmup_complete_ && trace_start_time_ + warmup_seconds_ * 1000000 <= + access.access_timestamp) { + for (auto& sim_cache : sim_caches_) { + sim_cache->reset_counter(); + } + warmup_complete_ = true; + } + for (auto& sim_cache : sim_caches_) { + auto handle = sim_cache->Lookup(access.block_key); + if (handle == nullptr && !access.no_insert) { + sim_cache->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr); + } + } +} + +void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { + if (!cache_simulator_) { + return; + } + if (output_miss_ratio_curve_path_.empty()) { + return; + } + std::ofstream out(output_miss_ratio_curve_path_); + if (!out.is_open()) { + return; + } + // Write header. + const std::string header = + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"; + out << header << std::endl; + uint64_t sim_cache_index = 0; + for (auto const& config : cache_simulator_->cache_configurations()) { + for (auto cache_capacity : config.cache_capacities) { + uint64_t hits = + cache_simulator_->sim_caches()[sim_cache_index]->get_hit_counter(); + uint64_t misses = + cache_simulator_->sim_caches()[sim_cache_index]->get_miss_counter(); + uint64_t total_accesses = hits + misses; + double miss_ratio = static_cast(misses * 100.0 / total_accesses); + // Write the body. 
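+    // One CSV row per simulated cache, matching the header written above:
+    // cache_name,num_shard_bits,capacity,miss_ratio,total_accesses.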
+ out << config.cache_name; + out << ","; + out << config.num_shard_bits; + out << ","; + out << cache_capacity; + out << ","; + out << std::fixed << std::setprecision(4) << miss_ratio; + out << ","; + out << total_accesses; + out << std::endl; + sim_cache_index++; + } + } + out.close(); +} + BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( - const std::string& trace_file_path) - : trace_file_path_(trace_file_path) { + const std::string& trace_file_path, + const std::string& output_miss_ratio_curve_path, + std::unique_ptr&& cache_simulator) + : trace_file_path_(trace_file_path), + output_miss_ratio_curve_path_(output_miss_ratio_curve_path), + cache_simulator_(std::move(cache_simulator)) { env_ = rocksdb::Env::Default(); } @@ -88,6 +211,9 @@ Status BlockCacheTraceAnalyzer::Analyze() { return s; } RecordAccess(access); + if (cache_simulator_) { + cache_simulator_->Access(access); + } } return Status::OK(); } @@ -118,6 +244,7 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { } fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block size stats for block type %s: \n%s", block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); @@ -125,6 +252,7 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { for (auto const& cf_bt_stats : cf_bt_stats_map) { const std::string& cf_name = cf_bt_stats.first; for (auto const& bt_stats : cf_bt_stats.second) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block size stats for column family %s and block type %s: \n%s", cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), @@ -160,6 +288,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { fprintf(stdout, "Block access count stats: \n%s", access_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block access count stats for block type %s: \n%s", block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); @@ -167,6 +296,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { for (auto const& cf_bt_stats : cf_bt_stats_map) { const std::string& cf_name = cf_bt_stats.first; for (auto const& bt_stats : cf_bt_stats.second) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Block access count stats for column family %s and block type " "%s: \n%s", @@ -230,23 +360,28 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { "the total number of keys in a block: \n%s", existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_existing_keys_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } + print_break_lines(/*num_break_lines=*/1); fprintf( stdout, "Histogram on percentage of referenced keys DO NOT exist in a block over " "the total number of keys in a block: \n%s", non_existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_non_existing_keys_stats_map) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Histogram on percentage of accesses on keys exist in a block over " "the total number of accesses in a block: \n%s", 
block_access_stats.ToString().c_str()); for (auto const& cf_stats : cf_block_access_info) { + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } @@ -318,15 +453,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { } // Print stats. - fprintf( - stdout, - "***************************************************************\n"); - fprintf( - stdout, - "***************************************************************\n"); - fprintf( - stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/3); fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str()); fprintf(stdout, "Number of files:%" PRIu64 "Number of blocks: %" PRIu64 @@ -338,9 +465,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { block_type.second); } for (auto caller : cf_caller_num_accesses_map) { - fprintf( - stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", caller_to_string(caller.first).c_str(), caller.second); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", @@ -368,12 +493,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { } } } - fprintf(stdout, - "***************************************************************\n"); - fprintf(stdout, - "***************************************************************\n"); - fprintf(stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/3); fprintf(stdout, "Overall statistics:\n"); fprintf(stdout, "Number of files: %" PRIu64 " Number of blocks: %" PRIu64 @@ -384,9 +504,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { block_type_to_string(block_type.first).c_str(), block_type.second); } for (auto caller : caller_num_access_map) { - fprintf( - stdout, - "***************************************************************\n"); + print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", caller_to_string(caller.first).c_str(), caller.second); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", @@ -405,4 +523,94 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { } } +std::vector parse_cache_config_file( + const std::string& config_path) { + std::ifstream file(config_path); + if (!file.is_open()) { + return {}; + } + std::vector configs; + std::string line; + while (getline(file, line)) { + CacheConfiguration cache_config; + std::stringstream ss(line); + std::vector config_strs; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + config_strs.push_back(substr); + } + // Sanity checks. 
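+    // A valid line must supply at least cache_name, num_shard_bits, and one
+    // cache capacity.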
+ if (config_strs.size() < 3) { + fprintf(stderr, "Invalid cache simulator configuration %s\n", + line.c_str()); + exit(1); + } + if (config_strs[0] != "lru") { + fprintf(stderr, "We only support LRU cache %s\n", line.c_str()); + exit(1); + } + cache_config.cache_name = config_strs[0]; + cache_config.num_shard_bits = ParseUint32(config_strs[1]); + for (uint32_t i = 2; i < config_strs.size(); i++) { + uint64_t capacity = ParseUint64(config_strs[i]); + if (capacity == 0) { + fprintf(stderr, "Invalid cache capacity %s, %s\n", + config_strs[i].c_str(), line.c_str()); + exit(1); + } + cache_config.cache_capacities.push_back(capacity); + } + configs.push_back(cache_config); + } + file.close(); + return configs; +} + +int block_cache_trace_analyzer_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_block_cache_trace_path.empty()) { + fprintf(stderr, "block cache trace path is empty\n"); + exit(1); + } + uint64_t warmup_seconds = + FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0; + std::vector cache_configs = + parse_cache_config_file(FLAGS_block_cache_sim_config_path); + std::unique_ptr cache_simulator; + if (!cache_configs.empty()) { + cache_simulator.reset( + new BlockCacheTraceSimulator(warmup_seconds, cache_configs)); + } + BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, + FLAGS_output_miss_ratio_curve_path, + std::move(cache_simulator)); + Status s = analyzer.Analyze(); + if (!s.IsIncomplete()) { + // Read all traces. + fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); + exit(1); + } + + analyzer.PrintStatsSummary(); + if (FLAGS_print_access_count_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintAccessCountStats(); + } + if (FLAGS_print_block_size_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintBlockSizeStats(); + } + if (FLAGS_print_data_block_access_count_stats) { + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintDataBlockAccessStats(); + } + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintMissRatioCurves(); + return 0; +} + } // namespace rocksdb + +#endif // GFLAGS +#endif // ROCKSDB_LITE diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 51bb1ec7930..1420906f3cf 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -9,10 +9,56 @@ #include #include "rocksdb/env.h" +#include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { +class BlockCacheTraceAnalyzer; + +// A cache configuration provided by user. +struct CacheConfiguration { + std::string cache_name; // LRU. + uint32_t num_shard_bits; + std::vector + cache_capacities; // simulate cache capacities in bytes. +}; + +// A block cache simulator that reports miss ratio curves given a set of cache +// configurations. +class BlockCacheTraceSimulator { + public: + // warmup_seconds: The number of seconds to warmup simulated caches. The + // hit/miss counters are reset after the warmup completes. + BlockCacheTraceSimulator( + uint64_t warmup_seconds, + const std::vector& cache_configurations); + ~BlockCacheTraceSimulator() = default; + // No copy and move. 
+ BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; + BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; + + void Access(const BlockCacheTraceRecord& access); + + const std::vector>& sim_caches() const { + return sim_caches_; + } + + const std::vector& cache_configurations() const { + return cache_configurations_; + } + + private: + const uint64_t warmup_seconds_; + const std::vector cache_configurations_; + + bool warmup_complete_ = false; + std::vector> sim_caches_; + uint64_t trace_start_time_ = 0; +}; + // Statistics of a block. struct BlockAccessInfo { uint64_t num_accesses = 0; @@ -67,7 +113,10 @@ struct ColumnFamilyAccessInfoAggregate { class BlockCacheTraceAnalyzer { public: - BlockCacheTraceAnalyzer(const std::string& trace_file_path); + BlockCacheTraceAnalyzer( + const std::string& trace_file_path, + const std::string& output_miss_ratio_curve_path, + std::unique_ptr&& cache_simulator); ~BlockCacheTraceAnalyzer() = default; // No copy and move. BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete; @@ -115,6 +164,8 @@ class BlockCacheTraceAnalyzer { // accesses on keys exist in a data block and its break down by column family. void PrintDataBlockAccessStats() const; + void PrintMissRatioCurves() const; + const std::map& TEST_cf_aggregates_map() const { return cf_aggregates_map_; @@ -124,9 +175,14 @@ class BlockCacheTraceAnalyzer { void RecordAccess(const BlockCacheTraceRecord& access); rocksdb::Env* env_; - std::string trace_file_path_; + const std::string trace_file_path_; + const std::string output_miss_ratio_curve_path_; + BlockCacheTraceHeader header_; + std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; }; +int block_cache_trace_analyzer_tool(int argc, char** argv); + } // namespace rocksdb diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index a75804492f6..df99e1f616e 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -3,6 +3,18 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, + "Please install gflags to run block_cache_trace_analyzer_test\n"); + return 1; +} +#else + +#include +#include #include #include @@ -25,6 +37,8 @@ const uint64_t kSSTStoringEvenKeys = 100; const uint64_t kSSTStoringOddKeys = 101; const std::string kRefKeyPrefix = "test-get-"; const uint64_t kNumKeysInBlock = 1024; +const int kMaxArgCount = 100; +const size_t kArgBufferSize = 100000; } // namespace class BlockCacheTracerTest : public testing::Test { @@ -34,6 +48,8 @@ class BlockCacheTracerTest : public testing::Test { env_ = rocksdb::Env::Default(); EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; + block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config"; + output_miss_ratio_curve_path_ = test_path_ + "/out_miss_ratio_curve"; } ~BlockCacheTracerTest() override { @@ -125,12 +141,94 @@ class BlockCacheTracerTest : public testing::Test { } } + void RunBlockCacheTraceAnalyzer() { + std::vector params = { + "./block_cache_trace_analyzer", + "-block_cache_trace_path=" + trace_file_path_, + "-block_cache_sim_config_path=" + block_cache_sim_config_path_, + "-output_miss_ratio_curve_path=" + output_miss_ratio_curve_path_, + "-print_block_size_stats", + "-print_access_count_stats", + "-print_data_block_access_count_stats", + "-cache_sim_warmup_seconds=0"}; + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + for (const auto& arg : params) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc + 1, kMaxArgCount); + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + + argv[argc++] = arg_buffer + cursor; + cursor += static_cast(arg.size()) + 1; + } + ASSERT_EQ(0, rocksdb::block_cache_trace_analyzer_tool(argc, argv)); + } + Env* env_; EnvOptions env_options_; + std::string output_miss_ratio_curve_path_; + std::string block_cache_sim_config_path_; std::string trace_file_path_; std::string test_path_; }; +TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { + { + // Generate a trace file. + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer)); + ASSERT_OK(writer.WriteHeader()); + WriteBlockAccess(&writer, 0, TraceType::kBlockTraceDataBlock, 50); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + { + // Generate a cache sim config. + std::string config = "lru,1,1K,1M,1G"; + std::ofstream out(block_cache_sim_config_path_); + ASSERT_TRUE(out.is_open()); + out << config << std::endl; + out.close(); + } + RunBlockCacheTraceAnalyzer(); + { + // Validate the cache miss ratios. + const std::vector expected_capacities{1024, 1024 * 1024, + 1024 * 1024 * 1024}; + std::ifstream infile(output_miss_ratio_curve_path_); + uint32_t config_index = 0; + std::string line; + // Read header. 
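+    // The first line is the CSV header emitted by PrintMissRatioCurves().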
+ ASSERT_TRUE(getline(infile, line)); + while (getline(infile, line)) { + std::stringstream ss(line); + std::vector result_strs; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + result_strs.push_back(substr); + } + ASSERT_EQ(5, result_strs.size()); + ASSERT_LT(config_index, expected_capacities.size()); + ASSERT_EQ("lru", result_strs[0]); // cache_name + ASSERT_EQ("1", result_strs[1]); // num_shard_bits + ASSERT_EQ(std::to_string(expected_capacities[config_index]), + result_strs[2]); // cache_capacity + ASSERT_EQ("100.0000", result_strs[3]); // miss_ratio + ASSERT_EQ("50", result_strs[4]); // number of accesses. + config_index++; + } + ASSERT_EQ(expected_capacities.size(), config_index); + infile.close(); + } + ASSERT_OK(env_->DeleteFile(output_miss_ratio_curve_path_)); + ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); +} + TEST_F(BlockCacheTracerTest, MixedBlocks) { { // Generate a trace file containing a mix of blocks. @@ -164,7 +262,9 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { ASSERT_EQ(kMajorVersion, header.rocksdb_major_version); ASSERT_EQ(kMinorVersion, header.rocksdb_minor_version); // Read blocks. - BlockCacheTraceAnalyzer analyzer(trace_file_path_); + BlockCacheTraceAnalyzer analyzer(trace_file_path_, + /*output_miss_ratio_curve_path=*/"", + /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); const uint64_t expected_num_cfs = 1; @@ -228,3 +328,12 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } +#endif // GFLAG +#else +#include +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "block_cache_trace_analyzer_test is not supported in ROCKSDB_LITE\n"); + return 0; +} +#endif // ROCKSDB_LITE diff --git a/tools/block_cache_trace_analyzer_tool.cc b/tools/block_cache_trace_analyzer_tool.cc new file mode 100644 index 00000000000..b7b36c5d241 --- /dev/null +++ b/tools/block_cache_trace_analyzer_tool.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else // GFLAGS +#include "tools/block_cache_trace_analyzer.h" +int main(int argc, char** argv) { + return rocksdb::block_cache_trace_analyzer_tool(argc, argv); +} +#endif // GFLAGS +#else // ROCKSDB_LITE +#include +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index 8629b60b095..f6f1e671450 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -152,10 +152,9 @@ class SimCacheImpl : public SimCache { public: // capacity for real cache (ShardedLRUCache) // test_capacity for key only cache - SimCacheImpl(std::shared_ptr cache, size_t sim_capacity, - int num_shard_bits) + SimCacheImpl(std::shared_ptr sim_cache, std::shared_ptr cache) : cache_(cache), - key_only_cache_(NewLRUCache(sim_capacity, num_shard_bits)), + key_only_cache_(sim_cache), miss_times_(0), hit_times_(0), stats_(nullptr) {} @@ -185,7 +184,9 @@ class SimCacheImpl : public SimCache { } cache_activity_logger_.ReportAdd(key, charge); - + if (!cache_) { + return Status::OK(); + } return cache_->Insert(key, value, charge, deleter, handle, priority); } @@ -201,7 +202,9 @@ class SimCacheImpl : public SimCache { } cache_activity_logger_.ReportLookup(key); - + if (!cache_) { + return nullptr; + } return cache_->Lookup(key, stats); } @@ -326,10 +329,17 @@ class SimCacheImpl : public SimCache { // For instrumentation purpose, use NewSimCache instead std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits) { + return NewSimCache(NewLRUCache(sim_capacity, num_shard_bits), cache, + num_shard_bits); +} + +std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } - return std::make_shared(cache, sim_capacity, num_shard_bits); + return std::make_shared(sim_cache, cache); } } // end namespace rocksdb From bcfc53b436b386d5a894bf10678b38c058aa1624 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 17 Jun 2019 17:56:09 -0700 Subject: [PATCH 154/572] Block cache tracing: Fix minor bugs with downsampling and some benchmark results. (#5473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: As the code changes for block cache tracing are almost complete, I did a benchmark to compare the performance when block cache tracing is enabled/disabled. With 1% downsampling ratio, the performance overhead of block cache tracing is negligible. When we trace all block accesses, the throughput drops by 6 folds with 16 threads issuing random reads and all reads are served in block cache. Setup: RocksDB: version 6.2 Date: Mon Jun 17 17:11:13 2019 CPU: 24 * Intel Core Processor (Skylake) CPUCache: 16384 KB Keys: 20 bytes each Values: 100 bytes each (100 bytes after compression) Entries: 10000000 Prefix: 20 bytes Keys per prefix: 0 RawSize: 1144.4 MB (estimated) FileSize: 1144.4 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: NoCompression Compression sampling rate: 0 Memtablerep: skip_list Perf Level: 1 I ran the readrandom workload for 1 minute. Detailed throughput results: (ops/second) Sample rate 0: no block cache tracing. 
Sample rate 1: trace all block accesses. Sample rate 100: trace accesses to 1% of blocks.

1 thread | Sample rate 0 | Sample rate 1 | Sample rate 100
-- | -- | -- | --
1 MB block cache size | 13,094 | 13,166 | 13,341
10 GB block cache size | 202,243 | 188,677 | 229,182

16 threads | Sample rate 0 | Sample rate 1 | Sample rate 100
-- | -- | -- | --
1 MB block cache size | 208,761 | 178,700 | 201,872
10 GB block cache size | 2,645,996 | 426,295 | 2,587,605

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5473 Differential Revision: D15869479 Pulled By: HaoyuHuang fbshipit-source-id: 7ae802abe84811281a6af8649f489887cd7c4618 --- tools/block_cache_trace_analyzer.cc | 2 +- trace_replay/block_cache_tracer.cc | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 0ef4b55e46f..3fd93a0239b 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -442,7 +442,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { caller_bt_num_access_map[caller][type] += num_accesses; caller_level_num_access_map[caller][level] += num_accesses; // Column Family stats. - cf_num_accesses++; + cf_num_accesses += num_accesses; cf_caller_num_accesses_map[caller] += num_accesses; cf_caller_level_num_accesses_map[caller][level] += num_accesses; cf_caller_file_num_accesses_map[caller][fd] += num_accesses; diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index f733bc9005f..a0f0676eecf 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -16,15 +16,14 @@ namespace rocksdb { namespace { const unsigned int kCharSize = 1; -bool ShouldTrace(const BlockCacheTraceRecord& record, - const TraceOptions& trace_options) { +bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) { if (trace_options.sampling_frequency == 0 || trace_options.sampling_frequency == 1) { return true; } // We use spatial downsampling so that we have a complete access history for a // block. - const uint64_t hash = GetSliceNPHash64(Slice(record.block_key)); + const uint64_t hash = GetSliceNPHash64(block_key); return hash % trace_options.sampling_frequency == 0; } } // namespace @@ -255,7 +254,7 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, const Slice& block_key, const Slice& cf_name, const Slice& referenced_key) { - if (!writer_.load() || !ShouldTrace(record, trace_options_)) { + if (!writer_.load() || !ShouldTrace(block_key, trace_options_)) { return Status::OK(); } InstrumentedMutexLock lock_guard(&trace_writer_mutex_); From ddd088c8b91f8f63a110cb3262cc4e4d22fab7ca Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Mon, 17 Jun 2019 21:12:37 -0700 Subject: [PATCH 155/572] fix rocksdb lite and clang contrun test failures (#5477) Summary: recent commit 671d15cbdd3839acb54cb21a2aa82efca4917155 introduced some test failures: ``` ===== Running stats_history_test [==========] Running 9 tests from 1 test case. [----------] Global test environment set-up. 
[----------] 9 tests from StatsHistoryTest [ RUN ] StatsHistoryTest.RunStatsDumpPeriodSec monitoring/stats_history_test.cc:63: Failure dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}}) Not implemented: Not supported in ROCKSDB LITE db/db_options_test.cc:28:11: error: unused variable 'kMicrosInSec' [-Werror,-Wunused-const-variable] const int kMicrosInSec = 1000000; ``` This PR fixes these failures Pull Request resolved: https://github.com/facebook/rocksdb/pull/5477 Differential Revision: D15871814 Pulled By: miasantreble fbshipit-source-id: 0a7023914d2c1784d9d2d3f5bfb47310d4855394 --- db/db_options_test.cc | 2 -- monitoring/stats_history_test.cc | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 7dd672646b5..fd8d849cd56 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -25,8 +25,6 @@ namespace rocksdb { -const int kMicrosInSec = 1000000; - class DBOptionsTest : public DBTestBase { public: DBOptionsTest() : DBTestBase("/db_options_test") {} diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index a66043da1fe..16681fe05d8 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -30,6 +30,7 @@ class StatsHistoryTest : public DBTestBase { public: StatsHistoryTest() : DBTestBase("/stats_history_test") {} }; +#ifndef ROCKSDB_LITE TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { Options options; @@ -566,6 +567,7 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); } +#endif // !ROCKSDB_LITE } // namespace rocksdb From f287f8dc930f0e5455cc236b65960abce6e7bbf0 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 18 Jun 2019 11:16:57 -0700 Subject: [PATCH 156/572] Fix a bug caused by secondary not skipping the beginning of new MANIFEST (#5472) Summary: While the secondary is replaying after the primary, the primary may switch to a new MANIFEST. The secondary is already able to detect and follow the primary to the new MANIFEST. However, the current implementation has a bug, described as follows. The new MANIFEST's first records have been generated by VersionSet::WriteSnapshot to describe the current state of the column families and the db as of the MANIFEST creation. Since the secondary instance has already finished recovering upon start, there is no need for the secondary to process these records. Actually, if the secondary were to replay these records, the secondary may end up adding the same SST files **again** to each column family, causing consistency checks done by VersionBuilder to fail. Therefore, we record the number of records to skip at the beginning of the new MANIFEST and ignore them. Test plan (on dev server) ``` $make clean && make -j32 all $./db_secondary_test ``` All existing unit tests must pass as well. 
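A hedged sketch of the skipping rule (stand-in types, not the actual implementation; the real change is in `ReactiveVersionSet::ReadAndApply` below): `VersionSet::WriteSnapshot()` writes two version edits per live column family at the head of a fresh MANIFEST, so the secondary ignores that many leading records after a switch:

```c++
#include <vector>

// Hypothetical stand-in for ColumnFamilyData, for illustration only.
struct ColumnFamily {
  bool dropped = false;
};

// Number of leading version edits a secondary should skip after detecting a
// MANIFEST switch: two per live (non-dropped) column family.
int EditsToSkip(const std::vector<ColumnFamily>& column_families) {
  int to_skip = 0;
  for (const auto& cf : column_families) {
    if (!cf.dropped) {
      to_skip += 2;  // WriteSnapshot() emits two edits per column family
    }
  }
  return to_skip;
}
```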
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5472 Differential Revision: D15866771 Pulled By: riversand963 fbshipit-source-id: a1eec4837fb2ad13059398efb0f437e74fd53bed --- HISTORY.md | 1 + db/db_impl/db_secondary_test.cc | 28 +++++++++++++++++++ db/version_set.cc | 48 +++++++++++++++++++++++++++++---- db/version_set.h | 3 +++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 228d02b61df..0b6409dbe47 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -28,6 +28,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Fix a bug caused by secondary not skipping the beginning of new MANIFEST. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c79589d5022..26f43c10745 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -525,6 +525,34 @@ TEST_F(DBSecondaryTest, SwitchManifest) { range_scan_db(); } +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + TEST_F(DBSecondaryTest, SwitchWAL) { const int kNumKeysPerMemtable = 1; Options options; diff --git a/db/version_set.cc b/db/version_set.cc index ccedca7940d..9978c8cd463 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5217,7 +5217,8 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr) {} + /*block_cache_tracer=*/nullptr), + number_of_edits_to_skip_(0) {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5415,6 +5416,17 @@ Status ReactiveVersionSet::ReadAndApply( break; } + // Skip the first VersionEdits of each MANIFEST generated by + // VersionSet::WriteSnapshot. 
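+    // These edits merely restate state the secondary already recovered at
+    // open; replaying them would re-add the same SST files to each column
+    // family and fail VersionBuilder's consistency checks.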
+ if (number_of_edits_to_skip_ > 0) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd != nullptr && !cfd->IsDropped()) { + --number_of_edits_to_skip_; + } + continue; + } + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; @@ -5463,8 +5475,33 @@ Status ReactiveVersionSet::ReadAndApply( // find the next MANIFEST, we should exit the loop. s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); reader = manifest_reader->get(); - if (s.ok() && reader->file()->file_name() == old_manifest_path) { - break; + if (s.ok()) { + if (reader->file()->file_name() == old_manifest_path) { + // Still processing the same MANIFEST, thus no need to continue this + // loop since no record is available if we have reached here. + break; + } else { + // We have switched to a new MANIFEST whose first records have been + // generated by VersionSet::WriteSnapshot. Since the secondary instance + // has already finished recovering upon start, there is no need for the + // secondary to process these records. Actually, if the secondary were + // to replay these records, the secondary may end up adding the same + // SST files AGAIN to each column family, causing consistency checks + // done by VersionBuilder to fail. Therefore, we record the number of + // records to skip at the beginning of the new MANIFEST and ignore + // them. + number_of_edits_to_skip_ = 0; + for (auto* cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + // Increase number_of_edits_to_skip by 2 because WriteSnapshot() + // writes 2 version edits for each column family at the beginning of + // the newly-generated MANIFEST. + // TODO(yanqin) remove hard-coded value. + number_of_edits_to_skip_ += 2; + } + } } } @@ -5504,7 +5541,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( return Status::OK(); } if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { + active_version_builders_.end() && !cfd->IsDropped()) { std::unique_ptr builder_guard( new BaseReferencedVersionBuilder(cfd)); active_version_builders_.insert( @@ -5532,6 +5569,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( delete cfd; cfd = nullptr; } + active_version_builders_.erase(builder_iter); } else { builder->Apply(&edit); } @@ -5543,7 +5581,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( return s; } - if (cfd != nullptr) { + if (cfd != nullptr && !cfd->IsDropped()) { s = builder->LoadTableHandlers( cfd->internal_stats(), db_options_->max_file_opening_threads, false /* prefetch_index_and_filter_in_cache */, diff --git a/db/version_set.h b/db/version_set.h index 90be94a789a..ba1b4d3e3d0 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1195,6 +1195,9 @@ class ReactiveVersionSet : public VersionSet { std::unordered_map> active_version_builders_; AtomicGroupReadBuffer read_buffer_; + // Number of version edits to skip by ReadAndApply at the beginning of a new + // MANIFEST created by primary. 
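+  // Set to two per live column family whenever a switch to a new MANIFEST is
+  // detected; see ReadAndApply().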
+ int number_of_edits_to_skip_; using VersionSet::LogAndApply; using VersionSet::Recover; From 4bd0cf541dc46cf2320311f047aaa559d5d40d3a Mon Sep 17 00:00:00 2001 From: siddontang Date: Tue, 18 Jun 2019 11:20:52 -0700 Subject: [PATCH 157/572] build on ARM64 (#5450) Summary: Support building RocksDB on AWS ARM64 ``` uname -m aarch64 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5450 Differential Revision: D15879851 fbshipit-source-id: a9b56520a2cd9921338305a06d7103a40a3300b8 --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index ac30f9ab0fa..4a52c6cddb7 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -537,7 +537,7 @@ if test -z "$PORTABLE"; then COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER " elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then COMMON_FLAGS="$COMMON_FLAGS -march=z10 " - elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then + elif test -n "`echo $TARGET_ARCHITECTURE | grep -e^arm -e^aarch64`"; then # TODO: Handle this with approprite options. COMMON_FLAGS="$COMMON_FLAGS" elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then From 5dc9fbd1175ad10454b877d9044c4b909d00ae3b Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Tue, 18 Jun 2019 11:53:43 -0700 Subject: [PATCH 158/572] Update the version of ZStd for the Rocks Java static build Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5228 Differential Revision: D15880451 Pulled By: sagar0 fbshipit-source-id: 84da6f42cac15367d95bffa5336ebd002e7c3308 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8e8c0ac7638..b0b52a37365 100644 --- a/Makefile +++ b/Makefile @@ -1684,8 +1684,8 @@ SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive LZ4_VER ?= 1.8.3 LZ4_SHA256 ?= 33af5936ac06536805f9745e0b6d61da606a1f8b4cc5c04dd3cbaca3b9b4fc43 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.3.7 -ZSTD_SHA256 ?= 5dd1e90eb16c25425880c8a91327f63de22891ffed082fcc17e5ae84fce0d5fb +ZSTD_VER ?= 1.4.0 +ZSTD_SHA256 ?= 63be339137d2b683c6d19a9e34f4fb684790e864fee13c7dd40e197a64c705c1 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 From 220870523cdfe100fadd29ec98cabd83a8112f82 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 18 Jun 2019 14:52:44 -0700 Subject: [PATCH 159/572] Fix compilation with USE_HDFS (#5444) Summary: The changes in https://github.com/facebook/rocksdb/commit/8272a6de57ed701fb25bb660e074cab703ed3fe7 were untested with `USE_HDFS=1`. There were a couple compiler errors. This PR fixes them. 
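As a minimal sketch of the error class being fixed (illustrative only, not code from this tree): a parameter name commented out to silence unused-parameter warnings, while conditionally compiled code still references it, only breaks the build once that configuration is actually compiled.

```cpp
struct EnvOptions {
  bool strict_bytes_per_sync = false;
};

// Illustrative only: this compiles cleanly until USE_HDFS is defined, at
// which point the body references a parameter whose name was commented out.
int OpenFile(const EnvOptions& /*options*/) {
#ifdef USE_HDFS
  // error: 'options' was not declared in this scope
  return options.strict_bytes_per_sync ? 1 : 0;
#else
  return 0;
#endif
}
```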
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5444 Test Plan: ``` $ EXTRA_LDFLAGS="-L/tmp/hadoop-3.1.2/lib/native/" EXTRA_CXXFLAGS="-I/tmp/hadoop-3.1.2/include" USE_HDFS=1 make -j12 check ``` Differential Revision: D15885009 fbshipit-source-id: 2a0a63739e0b9a2819b461ad63ce1292c4833fe2 --- env/env_hdfs.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 5bdf03ae3e1..207f0815bc4 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -420,7 +420,7 @@ Status HdfsEnv::NewRandomAccessFile(const std::string& fname, // create a new file for writing Status HdfsEnv::NewWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& /*options*/) { + const EnvOptions& options) { result->reset(); Status s; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); @@ -590,6 +590,11 @@ Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); } Status HdfsEnv::NewLogger(const std::string& fname, std::shared_ptr* result) { + // EnvOptions is used exclusively for its `strict_bytes_per_sync` value. That + // option is only intended for WAL/flush/compaction writes, so turn it off in + // the logger. + EnvOptions options; + options.strict_bytes_per_sync = false; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); if (f == nullptr || !f->isValid()) { delete f; From d0c6aea192f546fc049c90d2782636603c1a80f0 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 18 Jun 2019 14:53:35 -0700 Subject: [PATCH 160/572] Revert to respecting only the read_tier read option for index blocks (#5481) Summary: PR https://github.com/facebook/rocksdb/issues/5298 subtly changed how read options are applied to the index block during a Get, MultiGet, or iteration. Earlier, only the read_tier option applied to the index block read; since PR https://github.com/facebook/rocksdb/issues/5298, fill_cache and verify_checksums also have an effect. This patch restores the earlier behavior to prevent surprise memory increases for clients due to the index block not being cached. 
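A hedged sketch of the restored client-visible contract (`db` is assumed to be an open `rocksdb::DB*` and the key is a placeholder): only `read_tier` governs the index block read, while `fill_cache` and `verify_checksums` again apply to data blocks only.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::ReadOptions ro;
ro.fill_cache = false;        // scoped to data blocks, not the index block
ro.verify_checksums = false;  // likewise not applied to the index block
ro.read_tier = rocksdb::kBlockCacheTier;  // still honored for the index block

std::string value;
rocksdb::Status s = db->Get(ro, "key", &value);
if (s.IsIncomplete()) {
  // With kBlockCacheTier, a needed block (possibly the index) was not cached.
}
```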
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5481 Test Plan: make check Differential Revision: D15883082 Pulled By: ltamasi fbshipit-source-id: 9a065ec3a6db5a365cf6dd5e95190a20c5756356 --- table/block_based/block_based_table_reader.cc | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 0caea508822..adc5eb6b044 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -210,8 +210,7 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { return properties == nullptr || !properties->index_value_is_delta_encoded; } - Status GetOrReadIndexBlock(const ReadOptions& read_options, - GetContext* get_context, + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const; @@ -250,7 +249,7 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( } Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( - const ReadOptions& read_options, GetContext* get_context, + bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* index_block) const { assert(index_block != nullptr); @@ -260,6 +259,11 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( return Status::OK(); } + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, get_context, lookup_context, index_block); } @@ -304,9 +308,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = GetOrReadIndexBlock(read_options, get_context, - lookup_context, &index_block); + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -366,7 +371,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { Statistics* kNullStats = nullptr; CachableEntry index_block; - Status s = GetOrReadIndexBlock(ReadOptions(), nullptr /* get_context */, + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, &lookup_context, &index_block); if (!s.ok()) { ROCKS_LOG_WARN(rep->ioptions.info_log, @@ -489,9 +494,10 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = GetOrReadIndexBlock(read_options, get_context, - lookup_context, &index_block); + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -631,9 +637,10 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { + const bool no_io = 
(read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = GetOrReadIndexBlock(read_options, get_context, - lookup_context, &index_block); + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); From 92f631da33e88ce63f1546c3a4865cc4dc1d4e13 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Tue, 18 Jun 2019 16:35:57 -0700 Subject: [PATCH 161/572] replace sprintf with its safe version snprintf (#5475) Summary: sprintf is unsafe and has buffer overrun risk. Replace it with the safer version snprintf where buffer size is supplied to avoid overrun. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5475 Differential Revision: D15879481 Pulled By: sagar0 fbshipit-source-id: 7ae1958ffc9727fa50261dfbb98ddd74e70a72d8 --- tools/trace_analyzer_tool.cc | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 627610ae0f4..9ee746af4a2 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -572,7 +572,7 @@ Status TraceAnalyzer::MakeStatistics() { // output the access count distribution if (FLAGS_output_access_count_stats && stat.second.a_count_dist_f) { for (auto& record : stat.second.a_count_stats) { - ret = sprintf(buffer_, "access_count: %" PRIu64 " num: %" PRIu64 "\n", + ret = snprintf(buffer_, sizeof(buffer_), "access_count: %" PRIu64 " num: %" PRIu64 "\n", record.first, record.second); if (ret < 0) { return Status::IOError("Format the output failed"); @@ -596,7 +596,7 @@ Status TraceAnalyzer::MakeStatistics() { get_mid = true; } if (FLAGS_output_key_distribution && stat.second.a_key_size_f) { - ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 "\n", record.first, + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n", record.first, record.second); if (ret < 0) { return Status::IOError("Format output failed"); @@ -624,7 +624,7 @@ Status TraceAnalyzer::MakeStatistics() { if (FLAGS_output_value_distribution && stat.second.a_value_size_f && (type == TraceOperationType::kPut || type == TraceOperationType::kMerge)) { - ret = sprintf(buffer_, + ret = snprintf(buffer_, sizeof(buffer_), "Number_of_value_size_between %" PRIu64 " and %" PRIu64 " is: %" PRIu64 "\n", v_begin, v_end, record.second); @@ -675,7 +675,7 @@ Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) { succ_ratio = (static_cast(record.second.succ_count)) / record.second.access_count; } - ret = sprintf(buffer_, "%u %zu %" PRIu64 " %" PRIu64 " %f\n", + ret = snprintf(buffer_, sizeof(buffer_), "%u %zu %" PRIu64 " %" PRIu64 " %f\n", record.second.cf_id, record.second.value_size, record.second.key_id, record.second.access_count, succ_ratio); if (ret < 0) { @@ -703,7 +703,7 @@ Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) { prefix_succ_ratio = (static_cast(prefix_succ_access)) / prefix_access; } - ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n", + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n", record.second.key_id, prefix_access, prefix_count, prefix_ave_access, prefix_succ_ratio, prefix_out.c_str()); if (ret < 0) { @@ -809,7 +809,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { } if (stat.second.a_qps_f) { while (time_line < time_it.first) { - ret = sprintf(buffer_, "%u\n", 0); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", 0); if (ret < 0) { 
return Status::IOError("Format the output failed"); } @@ -821,7 +821,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { } time_line++; } - ret = sprintf(buffer_, "%u\n", time_it.second); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", time_it.second); if (ret < 0) { return Status::IOError("Format the output failed"); } @@ -870,7 +870,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { cur_ratio = (static_cast(find_time->second)) / cur_uni_key; cur_num = find_time->second; } - ret = sprintf(buffer_, "%" PRIu64 " %.12f\n", cur_num, cur_ratio); + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %.12f\n", cur_num, cur_ratio); if (ret < 0) { return Status::IOError("Format the output failed"); } @@ -887,7 +887,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { // output the prefix of top k access peak if (FLAGS_output_prefix_cut > 0 && stat.second.a_top_qps_prefix_f) { while (!stat.second.top_k_qps_sec.empty()) { - ret = sprintf(buffer_, "At time: %u with QPS: %u\n", + ret = snprintf(buffer_, sizeof(buffer_), "At time: %u with QPS: %u\n", stat.second.top_k_qps_sec.top().second, stat.second.top_k_qps_sec.top().first); if (ret < 0) { @@ -906,7 +906,7 @@ Status TraceAnalyzer::MakeStatisticQPS() { for (auto& qps_prefix : stat.second.a_qps_prefix_stats[qps_time]) { std::string qps_prefix_out = rocksdb::LDBCommand::StringToHex(qps_prefix.first); - ret = sprintf(buffer_, "The prefix: %s Access count: %u\n", + ret = snprintf(buffer_, sizeof(buffer_), "The prefix: %s Access count: %u\n", qps_prefix_out.c_str(), qps_prefix.second); if (ret < 0) { return Status::IOError("Format the output failed"); @@ -928,9 +928,9 @@ Status TraceAnalyzer::MakeStatisticQPS() { for (uint32_t i = 0; i < duration; i++) { for (int type = 0; type <= kTaTypeNum; type++) { if (type < kTaTypeNum) { - ret = sprintf(buffer_, "%u ", type_qps[i][type]); + ret = snprintf(buffer_, sizeof(buffer_), "%u ", type_qps[i][type]); } else { - ret = sprintf(buffer_, "%u\n", type_qps[i][type]); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", type_qps[i][type]); } if (ret < 0) { return Status::IOError("Format the output failed"); @@ -959,9 +959,9 @@ Status TraceAnalyzer::MakeStatisticQPS() { v = 0; } if (cf < cfs_size - 1) { - ret = sprintf(buffer_, "%u ", v); + ret = snprintf(buffer_, sizeof(buffer_), "%u ", v); } else { - ret = sprintf(buffer_, "%u\n", v); + ret = snprintf(buffer_, sizeof(buffer_), "%u\n", v); } if (ret < 0) { return Status::IOError("Format the output failed"); @@ -1016,7 +1016,7 @@ Status TraceAnalyzer::ReProcessing() { if (found != stat.a_key_stats.end()) { key_id = found->second.key_id; } - ret = sprintf(buffer_, "%u %" PRIu64 " %" PRIu64 "\n", + ret = snprintf(buffer_, sizeof(buffer_), "%u %" PRIu64 " %" PRIu64 "\n", stat.time_series.front().type, stat.time_series.front().ts, key_id); if (ret < 0) { @@ -1064,7 +1064,7 @@ Status TraceAnalyzer::ReProcessing() { TraceStats& stat = ta_[type].stats[cf_id]; if (stat.w_key_f) { if (stat.a_key_stats.find(input_key) != stat.a_key_stats.end()) { - ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 "\n", + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n", cfs_[cf_id].w_count, stat.a_key_stats[input_key].access_count); if (ret < 0) { @@ -1086,7 +1086,7 @@ Status TraceAnalyzer::ReProcessing() { prefix[type] = input_key.substr(0, FLAGS_output_prefix_cut); std::string prefix_out = rocksdb::LDBCommand::StringToHex(prefix[type]); - ret = sprintf(buffer_, "%" PRIu64 " %s\n", cfs_[cf_id].w_count, + ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %s\n", 
cfs_[cf_id].w_count, prefix_out.c_str()); if (ret < 0) { return Status::IOError("Format the output failed"); @@ -1904,7 +1904,7 @@ Status TraceAnalyzer::WriteTraceSequence(const uint32_t& type, std::string hex_key = rocksdb::LDBCommand::StringToHex(key); int ret; ret = - sprintf(buffer_, "%u %u %zu %" PRIu64 "\n", type, cf_id, value_size, ts); + snprintf(buffer_, sizeof(buffer_), "%u %u %zu %" PRIu64 "\n", type, cf_id, value_size, ts); if (ret < 0) { return Status::IOError("failed to format the output"); } From f46a2a03759a11731d62f01a0707a44ccab4cfbc Mon Sep 17 00:00:00 2001 From: Vaibhav Gogte Date: Tue, 18 Jun 2019 17:32:44 -0700 Subject: [PATCH 162/572] Export Cache::GetCharge (#5476) Summary: Exporting GetCharge to cache.hh Pull Request resolved: https://github.com/facebook/rocksdb/pull/5476 Differential Revision: D15881882 Pulled By: riversand963 fbshipit-source-id: 3d99084d10059b4fcaaaba240606ed50bc23351c --- cache/cache_test.cc | 8 ++++++++ cache/sharded_cache.h | 3 ++- include/rocksdb/cache.h | 3 +++ utilities/simulator_cache/sim_cache.cc | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 0cc3d559502..d7b191bb31f 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -686,6 +686,14 @@ TEST_P(CacheTest, DefaultShardBits) { ASSERT_EQ(6, sc->GetNumShardBits()); } +TEST_P(CacheTest, GetCharge) { + Insert(1, 2); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); + ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1, cache_->GetCharge(h1)); + cache_->Release(h1); +} + #ifdef SUPPORT_CLOCK_CACHE std::shared_ptr (*new_clock_cache_func)(size_t, int, bool) = NewClockCache; diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 920898b871f..0c1499f22dd 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -54,7 +54,8 @@ class ShardedCache : public Cache { virtual CacheShard* GetShard(int shard) = 0; virtual const CacheShard* GetShard(int shard) const = 0; virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const = 0; + virtual size_t GetCharge(Handle* handle) const override = 0; + virtual uint32_t GetHash(Handle* handle) const = 0; virtual void DisownData() override = 0; diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index ed7790aebb5..8fb691559d0 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -226,6 +226,9 @@ class Cache { // returns the memory size for the entries in use by the system virtual size_t GetPinnedUsage() const = 0; + // returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. 
diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index f6f1e671450..d84a593b9d5 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -235,6 +235,8 @@ class SimCacheImpl : public SimCache { return cache_->GetUsage(handle); } + size_t GetCharge(Handle* handle) const override { return cache_->GetCharge(handle); } + size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } void DisownData() override { From 2e8ad03ab3f9e498e682dd74600b0f8b5fc02d67 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 18 Jun 2019 18:34:39 -0700 Subject: [PATCH 163/572] Add more stats in the block cache trace analyzer (#5482) Summary: This PR adds more stats in the block cache trace analyzer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5482 Differential Revision: D15883553 Pulled By: HaoyuHuang fbshipit-source-id: 6d440e4f657af75690420102d532d0ee1ed4e9cf --- tools/block_cache_trace_analyzer.cc | 143 +++++++++++++++++++++------- tools/block_cache_trace_analyzer.h | 3 +- 2 files changed, 112 insertions(+), 34 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 3fd93a0239b..a8259de71b5 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -27,6 +27,10 @@ DEFINE_string( "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " "cache_name is lru. cache_capacity can be xK, xM or xG " "where x is a positive number."); +DEFINE_int32(block_cache_trace_downsample_ratio, 1, + "The trace collected accesses on one in every " + "block_cache_trace_downsample_ratio blocks. We scale " + "down the simulated cache size by this ratio."); DEFINE_bool(print_block_size_stats, false, "Print block size distribution and the distribution break down by " "block type and column family."); @@ -91,18 +95,30 @@ void print_break_lines(uint32_t num_break_lines) { } } +double percent(uint64_t numerator, uint64_t denomenator) { + if (denomenator == 0) { + return -1; + } + return static_cast(numerator * 100.0 / denomenator); +} + } // namespace BlockCacheTraceSimulator::BlockCacheTraceSimulator( - uint64_t warmup_seconds, + uint64_t warmup_seconds, uint32_t downsample_ratio, const std::vector& cache_configurations) : warmup_seconds_(warmup_seconds), + downsample_ratio_(downsample_ratio), cache_configurations_(cache_configurations) { for (auto const& config : cache_configurations_) { for (auto cache_capacity : config.cache_capacities) { - sim_caches_.push_back( - NewSimCache(NewLRUCache(cache_capacity, config.num_shard_bits), - /*real_cache=*/nullptr, config.num_shard_bits)); + // Scale down the cache capacity since the trace contains accesses on + // 1/'downsample_ratio' blocks. 
+ uint64_t simulate_cache_capacity = + cache_capacity / downsample_ratio_; + sim_caches_.push_back(NewSimCache( + NewLRUCache(simulate_cache_capacity, config.num_shard_bits), + /*real_cache=*/nullptr, config.num_shard_bits)); } } } @@ -285,11 +301,12 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { } } } - fprintf(stdout, "Block access count stats: \n%s", + fprintf(stdout, + "Block access count stats: The number of accesses per block.\n%s", access_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { print_break_lines(/*num_break_lines=*/1); - fprintf(stdout, "Block access count stats for block type %s: \n%s", + fprintf(stdout, "Break down by block type %s: \n%s", block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); } @@ -298,7 +315,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { for (auto const& bt_stats : cf_bt_stats.second) { print_break_lines(/*num_break_lines=*/1); fprintf(stdout, - "Block access count stats for column family %s and block type " + "Break down by column family %s and block type " "%s: \n%s", cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(), bt_stats.second.ToString().c_str()); @@ -313,6 +330,15 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { std::map cf_non_existing_keys_stats_map; HistogramStat block_access_stats; std::map cf_block_access_info; + HistogramStat percent_referenced_bytes; + std::map cf_percent_referenced_bytes; + // Total number of accesses in a data block / number of keys in a data block. + HistogramStat avg_naccesses_per_key_in_a_data_block; + std::map cf_avg_naccesses_per_key_in_a_data_block; + // The standard deviation on the number of accesses of a key in a data block. + HistogramStat stdev_naccesses_per_key_in_a_data_block; + std::map + cf_stdev_naccesses_per_key_in_a_data_block; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. 
@@ -343,6 +369,20 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { block_access_info.second.num_referenced_key_exist_in_block / (double)block_access_info.second.num_accesses) * 10000.0); + + HistogramStat hist_naccess_per_key; + for (auto const& key_access : + block_access_info.second.key_num_access_map) { + hist_naccess_per_key.Add(key_access.second); + } + uint64_t avg_accesses = hist_naccess_per_key.Average(); + uint64_t stdev_accesses = hist_naccess_per_key.StandardDeviation(); + avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); + cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); + stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); + cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add( + stdev_accesses); + existing_keys_stats.Add(percent_referenced_for_existing_keys); cf_existing_keys_stats_map[cf_name].Add( percent_referenced_for_existing_keys); @@ -356,7 +396,7 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { } } fprintf(stdout, - "Histogram on percentage of referenced keys existing in a block over " + "Histogram on the number of referenced keys existing in a block over " "the total number of keys in a block: \n%s", existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_existing_keys_stats_map) { @@ -367,7 +407,7 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { print_break_lines(/*num_break_lines=*/1); fprintf( stdout, - "Histogram on percentage of referenced keys DO NOT exist in a block over " + "Histogram on the number of referenced keys DO NOT exist in a block over " "the total number of keys in a block: \n%s", non_existing_keys_stats.ToString().c_str()); for (auto const& cf_stats : cf_non_existing_keys_stats_map) { @@ -377,7 +417,7 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { } print_break_lines(/*num_break_lines=*/1); fprintf(stdout, - "Histogram on percentage of accesses on keys exist in a block over " + "Histogram on the number of accesses on keys exist in a block over " "the total number of accesses in a block: \n%s", block_access_stats.ToString().c_str()); for (auto const& cf_stats : cf_block_access_info) { @@ -385,6 +425,24 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { fprintf(stdout, "Break down by column family %s: \n%s", cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); } + print_break_lines(/*num_break_lines=*/1); + fprintf( + stdout, + "Histogram on the average number of accesses per key in a block: \n%s", + avg_naccesses_per_key_in_a_data_block.ToString().c_str()); + for (auto const& cf_stats : cf_avg_naccesses_per_key_in_a_data_block) { + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } + print_break_lines(/*num_break_lines=*/1); + fprintf(stdout, + "Histogram on the standard deviation of the number of accesses per " + "key in a block: \n%s", + stdev_naccesses_per_key_in_a_data_block.ToString().c_str()); + for (auto const& cf_stats : cf_stdev_naccesses_per_key_in_a_data_block) { + fprintf(stdout, "Break down by column family %s: \n%s", + cf_stats.first.c_str(), cf_stats.second.ToString().c_str()); + } } void BlockCacheTraceAnalyzer::PrintStatsSummary() const { @@ -456,40 +514,49 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { print_break_lines(/*num_break_lines=*/3); fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str()); fprintf(stdout, - "Number of files:%" PRIu64 "Number of blocks: %" PRIu64 
- "Number of accesses: %" PRIu64 "\n", + " Number of files:%" PRIu64 " Number of blocks: %" PRIu64 + " Number of accesses: %" PRIu64 "\n", cf_num_files, cf_num_blocks, cf_num_accesses); for (auto block_type : cf_bt_blocks) { - fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n", - block_type_to_string(block_type.first).c_str(), - block_type.second); + fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n", + block_type_to_string(block_type.first).c_str(), block_type.second, + percent(block_type.second, cf_num_blocks)); } for (auto caller : cf_caller_num_accesses_map) { + const uint64_t naccesses = caller.second; print_break_lines(/*num_break_lines=*/1); - fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", - caller_to_string(caller.first).c_str(), caller.second); + fprintf(stdout, + "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n", + caller_to_string(caller.first).c_str(), naccesses, + percent(naccesses, cf_num_accesses)); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_level : cf_caller_level_num_accesses_map[caller.first]) { fprintf(stdout, - "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 "\n", - naccess_level.first, naccess_level.second); + "\t Level %" PRIu64 ": Number of accesses: %" PRIu64 + " Percent: %.2f\n", + naccess_level.first, naccess_level.second, + percent(naccess_level.second, naccesses)); } fprintf(stdout, "Caller %s: Number of accesses per file break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_file : cf_caller_file_num_accesses_map[caller.first]) { fprintf(stdout, - "\t File %" PRIu64 ": Number of accesses: %" PRIu64 "\n", - naccess_file.first, naccess_file.second); + "\t File %" PRIu64 ": Number of accesses: %" PRIu64 + " Percent: %.2f\n", + naccess_file.first, naccess_file.second, + percent(naccess_file.second, naccesses)); } fprintf(stdout, "Caller %s: Number of accesses per block type break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_type : cf_caller_bt_num_accesses_map[caller.first]) { - fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n", + fprintf(stdout, + "\t Block Type %s: Number of accesses: %" PRIu64 + " Percent: %.2f\n", block_type_to_string(naccess_type.first).c_str(), - naccess_type.second); + naccess_type.second, percent(naccess_type.second, naccesses)); } } } @@ -500,25 +567,32 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { " Number of accesses: %" PRIu64 "\n", total_num_files, total_num_blocks, total_num_accesses); for (auto block_type : bt_num_blocks_map) { - fprintf(stdout, "Number of %s blocks: %" PRIu64 "\n", - block_type_to_string(block_type.first).c_str(), block_type.second); + fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n", + block_type_to_string(block_type.first).c_str(), block_type.second, + percent(block_type.second, total_num_blocks)); } for (auto caller : caller_num_access_map) { print_break_lines(/*num_break_lines=*/1); - fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 "\n", - caller_to_string(caller.first).c_str(), caller.second); + uint64_t naccesses = caller.second; + fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n", + caller_to_string(caller.first).c_str(), naccesses, + percent(naccesses, total_num_accesses)); fprintf(stdout, "Caller %s: Number of accesses per level break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_level : 
caller_level_num_access_map[caller.first]) { - fprintf(stdout, "\t Level %d: Number of accesses: %" PRIu64 "\n", - naccess_level.first, naccess_level.second); + fprintf(stdout, + "\t Level %d: Number of accesses: %" PRIu64 " Percent: %.2f\n", + naccess_level.first, naccess_level.second, + percent(naccess_level.second, naccesses)); } fprintf(stdout, "Caller %s: Number of accesses per block type break down\n", caller_to_string(caller.first).c_str()); for (auto naccess_type : caller_bt_num_access_map[caller.first]) { - fprintf(stdout, "\t Block Type %s: Number of accesses: %" PRIu64 "\n", + fprintf(stdout, + "\t Block Type %s: Number of accesses: %" PRIu64 + " Percent: %.2f\n", block_type_to_string(naccess_type.first).c_str(), - naccess_type.second); + naccess_type.second, percent(naccess_type.second, naccesses)); } } } @@ -575,12 +649,15 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { } uint64_t warmup_seconds = FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0; + uint32_t downsample_ratio = FLAGS_block_cache_trace_downsample_ratio > 0 + ? FLAGS_block_cache_trace_downsample_ratio + : 0; std::vector cache_configs = parse_cache_config_file(FLAGS_block_cache_sim_config_path); std::unique_ptr cache_simulator; if (!cache_configs.empty()) { - cache_simulator.reset( - new BlockCacheTraceSimulator(warmup_seconds, cache_configs)); + cache_simulator.reset(new BlockCacheTraceSimulator( + warmup_seconds, downsample_ratio, cache_configs)); } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, FLAGS_output_miss_ratio_curve_path, diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 1420906f3cf..0690d14d0f3 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -31,7 +31,7 @@ class BlockCacheTraceSimulator { // warmup_seconds: The number of seconds to warmup simulated caches. The // hit/miss counters are reset after the warmup completes. BlockCacheTraceSimulator( - uint64_t warmup_seconds, + uint64_t warmup_seconds, uint32_t downsample_ratio, const std::vector& cache_configurations); ~BlockCacheTraceSimulator() = default; // No copy and move. @@ -52,6 +52,7 @@ class BlockCacheTraceSimulator { private: const uint64_t warmup_seconds_; + const uint32_t downsample_ratio_; const std::vector cache_configurations_; bool warmup_complete_ = false; From 5355e527d9a4704f2057962f243318772896b4aa Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 18 Jun 2019 19:00:03 -0700 Subject: [PATCH 164/572] Make the 'block read count' performance counters consistent (#5484) Summary: The patch brings the semantics of per-block-type read performance context counters in sync with the generic block_read_count by only incrementing the counter if the block was actually read from the file. It also fixes index_block_read_count, which fell victim to the refactoring in PR https://github.com/facebook/rocksdb/issues/5298. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5484 Test Plan: Extended the unit tests. 
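As an additional illustration, a hedged sketch of how these counters are typically observed (assuming `db` is an open `rocksdb::DB*`); with this patch, each per-type counter increments only when its block is actually read from the file:

```cpp
#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
rocksdb::get_perf_context()->Reset();

std::string value;
db->Get(rocksdb::ReadOptions(), "key", &value);

const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
// A block cache hit on the index now leaves index_block_read_count at 0,
// consistent with block_read_count.
std::fprintf(stdout, "block=%llu index=%llu filter=%llu\n",
             static_cast<unsigned long long>(ctx->block_read_count),
             static_cast<unsigned long long>(ctx->index_block_read_count),
             static_cast<unsigned long long>(ctx->filter_block_read_count));
```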
Differential Revision: D15887431 Pulled By: ltamasi fbshipit-source-id: a3889759d0ac5759d56625d692cd828d1b9207a6 --- HISTORY.md | 1 + table/block_based/block_based_table_reader.cc | 68 ++++++++++++++----- table/block_based/block_based_table_reader.h | 2 + table/block_based/block_type.h | 6 ++ table/block_fetcher.cc | 20 ++++++ table/block_fetcher.h | 5 +- table/meta_blocks.cc | 15 ++-- table/meta_blocks.h | 3 +- table/plain/plain_table_reader.cc | 5 +- table/table_test.cc | 22 ++++-- 10 files changed, 113 insertions(+), 34 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0b6409dbe47..18feefafce8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index adc5eb6b044..66fe34b95ea 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -80,14 +80,14 @@ Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, + bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { BlockContents contents; BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, &contents, ioptions, do_uncompress, - maybe_compressed, uncompression_dict, + maybe_compressed, block_type, uncompression_dict, cache_options, memory_allocator); Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { @@ -603,8 +603,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { BlockFetcher prefixes_block_fetcher( file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -613,8 +613,8 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { BlockFetcher prefixes_meta_block_fetcher( file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, &prefixes_meta_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + true 
/*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = prefixes_meta_block_fetcher.ReadBlockContents(); if (!s.ok()) { // TODO: log error @@ -1373,7 +1373,8 @@ Status BlockBasedTable::ReadCompressionDictBlock( rep_->file.get(), prefetch_buffer, rep_->footer, read_options, rep_->compression_dict_handle, compression_dict_cont.get(), rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options); + BlockType::kCompressionDictionary, UncompressionDict::GetEmptyDict(), + cache_options); s = compression_block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -1583,7 +1584,7 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, Status s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), rep_->footer.metaindex_handle(), &meta, rep_->ioptions, - true /* decompress */, true /*maybe_compressed*/, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options)); @@ -1818,8 +1819,9 @@ FilterBlockReader* BlockBasedTable::ReadFilter( BlockFetcher block_fetcher( rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); + false /*maybe_compressed*/, BlockType::kFilter, + UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, + GetMemoryAllocator(rep->table_options)); Status s = block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -1940,7 +1942,6 @@ CachableEntry BlockBasedTable::GetFilter( ? Cache::Priority::HIGH : Cache::Priority::LOW); if (s.ok()) { - PERF_COUNTER_ADD(filter_block_read_count, 1); UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); } else { RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); @@ -2021,7 +2022,6 @@ CachableEntry BlockBasedTable::GetUncompressionDict( : Cache::Priority::LOW); if (s.ok()) { - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary, get_context, usage); dict = uncompression_dict.release(); @@ -2217,7 +2217,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, - uncompression_dict, rep_->persistent_cache_options, + block_type, uncompression_dict, rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); @@ -2335,7 +2335,7 @@ Status BlockBasedTable::RetrieveBlock( s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, rep_->blocks_maybe_compressed, - rep_->blocks_maybe_compressed, uncompression_dict, + rep_->blocks_maybe_compressed, block_type, uncompression_dict, rep_->persistent_cache_options, rep_->get_global_seqno(block_type), block_type == BlockType::kData ? 
rep_->table_options.read_amp_bytes_per_bit @@ -3335,7 +3335,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks( BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, + false /* decompress */, false /*maybe_compressed*/, BlockType::kData, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -3345,6 +3345,38 @@ Status BlockBasedTable::VerifyChecksumInBlocks( return s; } +BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( + const Slice& meta_block_name) { + if (meta_block_name.starts_with(kFilterBlockPrefix) || + meta_block_name.starts_with(kFullFilterBlockPrefix) || + meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) { + return BlockType::kFilter; + } + + if (meta_block_name == kPropertiesBlock) { + return BlockType::kProperties; + } + + if (meta_block_name == kCompressionDictBlock) { + return BlockType::kCompressionDictionary; + } + + if (meta_block_name == kRangeDelBlock) { + return BlockType::kRangeDeletion; + } + + if (meta_block_name == kHashIndexPrefixesBlock) { + return BlockType::kHashIndexPrefixes; + } + + if (meta_block_name == kHashIndexPrefixesMetadataBlock) { + return BlockType::kHashIndexMetadata; + } + + assert(false); + return BlockType::kInvalid; +} + Status BlockBasedTable::VerifyChecksumInMetaBlocks( InternalIteratorBase* index_iter) { Status s; @@ -3357,13 +3389,15 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( Slice input = index_iter->value(); s = handle.DecodeFrom(&input); BlockContents contents; + const Slice meta_block_name = index_iter->key(); BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, ReadOptions(), handle, &contents, rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); - if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { + if (s.IsCorruption() && meta_block_name == kPropertiesBlock) { TableProperties* table_properties; s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, index_iter->value(), @@ -3662,7 +3696,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, ReadOptions(), handle, &block, rep_->ioptions, false /*decompress*/, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), + BlockType::kFilter, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 17c4e7238c8..3c92621bdcd 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -407,6 +407,8 @@ class BlockBasedTable : public TableReader { const BlockBasedTableOptions& table_options, const int level, BlockCacheLookupContext* lookup_context); + static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); + Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h index 9b9c53946c9..a60be2e6a70 100644 --- a/table/block_based/block_type.h +++ 
b/table/block_based/block_type.h @@ -5,6 +5,8 @@ #pragma once +#include + namespace rocksdb { // Represents the types of blocks used in the block based table format. @@ -17,8 +19,12 @@ enum class BlockType : uint8_t { kProperties, kCompressionDictionary, kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, kMetaIndex, kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid }; } // namespace rocksdb diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index afcbbaee4f5..35beb79502b 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -220,6 +220,26 @@ Status BlockFetcher::ReadBlockContents() { &slice_, used_buf_); } PERF_COUNTER_ADD(block_read_count, 1); + + // TODO: introduce dedicated perf counter for range tombstones + switch (block_type_) { + case BlockType::kFilter: + PERF_COUNTER_ADD(filter_block_read_count, 1); + break; + + case BlockType::kCompressionDictionary: + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(index_block_read_count, 1); + break; + + // Nothing to do here as we don't have counters for the other types. + default: + break; + } + PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); if (!status_.ok()) { return status_; diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 6451d6d2acc..06e5d9dfa31 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -10,6 +10,7 @@ #pragma once #include "memory/memory_allocator.h" #include "table/block_based/block.h" +#include "table/block_based/block_type.h" #include "table/format.h" namespace rocksdb { @@ -39,7 +40,7 @@ class BlockFetcher { FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, + bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator = nullptr, @@ -53,6 +54,7 @@ class BlockFetcher { ioptions_(ioptions), do_uncompress_(do_uncompress), maybe_compressed_(maybe_compressed), + block_type_(block_type), uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), @@ -72,6 +74,7 @@ class BlockFetcher { const ImmutableCFOptions& ioptions_; bool do_uncompress_; bool maybe_compressed_; + BlockType block_type_; const UncompressionDict& uncompression_dict_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 341a1185579..7bbbc7966de 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -216,7 +216,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, BlockFetcher block_fetcher( file, prefetch_buffer, footer, read_options, handle, &block_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); // property block is never compressed. Need to add uncompress logic if we are // to compress it.. 
@@ -375,8 +376,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -446,7 +447,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -467,7 +469,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool /*compression_type_missing*/, MemoryAllocator* memory_allocator) { Status status; @@ -488,6 +490,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); @@ -515,7 +518,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, // Reading metablock BlockFetcher block_fetcher2( file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, + ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); return block_fetcher2.ReadBlockContents(); } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 5224c54714d..86c703f953c 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -16,6 +16,7 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_based/block_builder.h" +#include "table/block_based/block_type.h" #include "table/format.h" #include "util/kv_map.h" @@ -143,7 +144,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 15f7be1c253..2f8f300d871 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -299,7 +299,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents, + BlockType::kIndex, &index_block_contents, true 
/* compression_type_missing */); bool index_in_file = s.ok(); @@ -310,7 +310,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, &bloom_block_contents, + BloomBlockBuilder::kBloomBlock, BlockType::kFilter, + &bloom_block_contents, true /* compression_type_missing */); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } diff --git a/table/table_test.cc b/table/table_test.cc index c59c9d8c33f..e836f89a8df 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2268,6 +2268,8 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { if (index_and_filter_in_cache) { // data, index and filter block ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } else { // just the data block ASSERT_EQ(get_perf_context()->block_read_count, 1); @@ -2293,9 +2295,12 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { if (bloom_filter_type == 0) { // with block-based, we read index and then the filter ASSERT_EQ(get_perf_context()->block_read_count, 2); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } else { // with full-filter, we read filter first and then we stop ASSERT_EQ(get_perf_context()->block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } } else { // filter is already in memory and it figures out that the key doesn't @@ -3565,7 +3570,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, &footer, kBlockBasedTableMagicNumber)); - auto BlockFetchHelper = [&](const BlockHandle& handle, + auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { ReadOptions read_options; read_options.verify_checksums = false; @@ -3574,8 +3579,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, handle, contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options); + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), cache_options); ASSERT_OK(block_fetcher.ReadBlockContents()); }; @@ -3584,7 +3589,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; - BlockFetchHelper(metaindex_handle, &metaindex_contents); + BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, + &metaindex_contents); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); @@ -3601,7 +3607,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(properties_handle.DecodeFrom(&v)); BlockContents properties_contents; - BlockFetchHelper(properties_handle, &properties_contents); + BlockFetchHelper(properties_handle, BlockType::kProperties, + &properties_contents); Block properties_block(std::move(properties_contents), kDisableGlobalSequenceNumber); @@ -3660,8 +3667,9 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, 
ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - pcache_opts, nullptr /*memory_allocator*/); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); From fe90ed7a70b6cda47ef970375e74b9a0b486cab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Wed, 19 Jun 2019 08:02:21 -0700 Subject: [PATCH 165/572] Replace Corruption with TryAgain status when new tail is not visible to TransactionLogIterator (#5474) Summary: When tailing the WAL with TransactionLogIterator, it used to return Corruption status to indicate that the WAL has new tail that is not visible to the iterator, which is a misleading status. The patch replaces it with TryAgain which is more descriptive of a status, indicating that the user needs to create a new iterator to fetch the recent tail. Fixes https://github.com/facebook/rocksdb/issues/5455 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5474 Differential Revision: D15898953 Pulled By: maysamyabandeh fbshipit-source-id: 40966f6457cb539e1aeb104daeada6b0e46059fc --- HISTORY.md | 1 + db/transaction_log_impl.cc | 3 ++- db/wal_manager_test.cc | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 18feefafce8..825c1def47c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -29,6 +29,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. 
 ## 6.2.0 (4/30/2019)

diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc
index 2e4475bb6ac..8c526af12ae 100644
--- a/db/transaction_log_impl.cc
+++ b/db/transaction_log_impl.cc
@@ -199,7 +199,8 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) {
     if (current_last_seq_ == versions_->LastSequence()) {
       current_status_ = Status::OK();
     } else {
-      current_status_ = Status::Corruption("NO MORE DATA LEFT");
+      const char* msg = "Create a new iterator to fetch the new tail.";
+      current_status_ = Status::TryAgain(msg);
     }
     return;
   }
diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc
index 1bc6a8afe83..671dc84e1b8 100644
--- a/db/wal_manager_test.cc
+++ b/db/wal_manager_test.cc
@@ -293,6 +293,29 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
   // Check that an empty iterator is returned
   ASSERT_TRUE(!iter->Valid());
 }
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+  Init();
+  CreateArchiveLogs(2, 100);
+  auto iter = OpenTransactionLogIter(0);
+  CreateArchiveLogs(1, 100);
+  int i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 200);
+  // A new log file was added after the iterator was created.
+  // TryAgain indicates a new iterator is needed to fetch the new data
+  ASSERT_TRUE(iter->status().IsTryAgain());
+
+  iter = OpenTransactionLogIter(0);
+  i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 300);
+  ASSERT_TRUE(iter->status().ok());
+}
 } // namespace rocksdb

From 24b118ad986e656a11b94ad441cd455830bac7b2 Mon Sep 17 00:00:00 2001
From: Vijay Nadimpalli
Date: Wed, 19 Jun 2019 14:07:36 -0700
Subject: [PATCH 166/572] Combine the read-ahead logic for user reads and
 compaction reads (#5431)

Summary:
Currently the read-ahead logic for user reads and compaction reads goes
through different code paths where compaction reads create new table
readers and use `ReadaheadRandomAccessFile`. This change unifies the
read-ahead logic so that both paths use the read-ahead in
BlockBasedTableReader::InitDataBlock(). As a result of the change, the
`ReadaheadRandomAccessFile` class and the
`new_table_reader_for_compaction_inputs` option will no longer be used.
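A hedged configuration sketch (the path and size are placeholders, and the continued role of `compaction_readahead_size` as the compaction read-ahead knob is an assumption): after this change, compaction read-ahead is handled inside the block-based table reader, so setting `new_table_reader_for_compaction_inputs` no longer changes behavior.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_readahead_size = 2 * 1024 * 1024;  // 2MB read-ahead
  // options.new_table_reader_for_compaction_inputs = true;  // no longer used

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/readahead_example", &db);
  delete db;
  return s.ok() ? 0 : 1;
}
```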
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5431 Test Plan: make check Here is the benchmarking - https://gist.github.com/vjnadimpalli/083cf423f7b6aa12dcdb14c858bc18a5 Differential Revision: D15772533 Pulled By: vjnadimpalli fbshipit-source-id: b71dca710590471ede6fb37553388654e2e479b9 --- db/db_compaction_test.cc | 21 ++--- db/table_cache.cc | 76 ++++--------------- db/table_cache.h | 3 +- include/rocksdb/options.h | 2 + table/block_based/block_based_table_reader.cc | 39 ++++++---- table/block_based/block_based_table_reader.h | 19 ++++- table/block_fetcher.cc | 5 +- table/block_fetcher.h | 8 +- table/cuckoo/cuckoo_table_reader.cc | 3 +- table/cuckoo/cuckoo_table_reader.h | 6 +- table/meta_blocks.cc | 11 ++- table/mock_table.cc | 3 +- table/mock_table.h | 3 +- table/plain/plain_table_reader.cc | 3 +- table/plain/plain_table_reader.h | 6 +- table/table_reader.h | 11 +-- table/table_test.cc | 4 +- util/file_reader_writer.cc | 16 ++-- util/file_reader_writer.h | 11 ++- 19 files changed, 124 insertions(+), 126 deletions(-) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 6537950fcc7..7f639c85397 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -497,14 +497,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // Create new iterator for: // (1) 1 for verifying flush results - // (2) 3 for compaction input files - // (3) 1 for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 5); + // (2) 1 for verifying compaction results. + // (3) New TableReaders will not be created for compaction inputs + ASSERT_EQ(num_new_table_reader, 2); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5); ASSERT_EQ(num_new_table_reader, 0); num_table_cache_lookup = 0; @@ -519,13 +519,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // May preload table cache too. ASSERT_GE(num_table_cache_lookup, 1); old_num_table_cache_lookup2 = num_table_cache_lookup; - // One for compaction input, one for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 2); + // One for verifying compaction results. + // No new iterator created for compaction. 
+ ASSERT_EQ(num_new_table_reader, 1); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); ASSERT_EQ(num_new_table_reader, 0); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -4339,12 +4340,6 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) { options.env = new MockEnv(Env::Default()); Reopen(options); bool readahead = false; - SyncPoint::GetInstance()->SetCallBack( - "TableCache::NewIterator:for_compaction", [&](void* arg) { - bool* use_direct_reads = static_cast(arg); - ASSERT_EQ(*use_direct_reads, - options.use_direct_reads); - }); SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { bool* use_direct_writes = static_cast(arg); diff --git a/db/table_cache.cc b/db/table_cache.cc index 0a152f89a16..bbfaf32e09e 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -16,6 +16,7 @@ #include "monitoring/perf_context_imp.h" #include "rocksdb/statistics.h" +#include "table/block_based/block_based_table_reader.h" #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" @@ -43,13 +44,6 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } -static void DeleteTableReader(void* arg1, void* arg2) { - TableReader* table_reader = reinterpret_cast(arg1); - Statistics* stats = reinterpret_cast(arg2); - RecordTick(stats, NO_FILE_CLOSES); - delete table_reader; -} - static Slice GetSliceForFileNumber(const uint64_t* file_number) { return Slice(reinterpret_cast(file_number), sizeof(*file_number)); @@ -96,8 +90,8 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - bool sequential_mode, size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, bool for_compaction) { std::string fname = @@ -107,13 +101,6 @@ Status TableCache::GetTableReader( RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (readahead > 0 && !env_options.use_mmap_reads) { - // Not compatible with mmap files since ReadaheadRandomAccessFile requires - // its wrapped file's Read() to copy data into the provided scratch - // buffer, which mmap files don't use. - // TODO(ajkr): try madvise for mmap files in place of buffered readahead. 
- file = NewReadaheadRandomAccessFile(std::move(file), readahead); - } if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } @@ -164,10 +151,9 @@ Status TableCache::FindTable(const EnvOptions& env_options, } std::unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, - false /* sequential mode */, 0 /* readahead */, - record_read_stats, file_read_hist, &table_reader, - prefix_extractor, skip_filters, level, - prefetch_index_and_filter_in_cache); + false /* sequential mode */, record_read_stats, + file_read_hist, &table_reader, prefix_extractor, + skip_filters, level, prefetch_index_and_filter_in_cache); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); @@ -196,48 +182,21 @@ InternalIterator* TableCache::NewIterator( PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; - bool create_new_table_reader = false; TableReader* table_reader = nullptr; Cache::Handle* handle = nullptr; if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - size_t readahead = 0; - if (for_compaction) { -#ifndef NDEBUG - bool use_direct_reads_for_compaction = env_options.use_direct_reads; - TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction", - &use_direct_reads_for_compaction); -#endif // !NDEBUG - if (ioptions_.new_table_reader_for_compaction_inputs) { - // get compaction_readahead_size from env_options allows us to set the - // value dynamically - readahead = env_options.compaction_readahead_size; - create_new_table_reader = true; - } - } auto& fd = file_meta.fd; - if (create_new_table_reader) { - std::unique_ptr table_reader_unique_ptr; - s = GetTableReader( - env_options, icomparator, fd, true /* sequential_mode */, readahead, - !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, - prefix_extractor, false /* skip_filters */, level, - true /* prefetch_index_and_filter_in_cache */, for_compaction); + table_reader = fd.table_reader; + if (table_reader == nullptr) { + s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record read_stats */, file_read_hist, + skip_filters, level); if (s.ok()) { - table_reader = table_reader_unique_ptr.release(); - } - } else { - table_reader = fd.table_reader; - if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, - skip_filters, level); - if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); - } + table_reader = GetTableReaderFromHandle(handle); } } InternalIterator* result = nullptr; @@ -247,13 +206,10 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, for_compaction); + skip_filters, for_compaction, + env_options.compaction_readahead_size); } - if (create_new_table_reader) { - assert(handle == nullptr); - result->RegisterCleanup(&DeleteTableReader, table_reader, - ioptions_.statistics); - } else if (handle != nullptr) { + if (handle != nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); handle = nullptr; // prevent from releasing below } diff --git a/db/table_cache.h b/db/table_cache.h index 1577cef82ff..dbf76039a23 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -177,8 +177,7 @@ class TableCache { Status 
GetTableReader(const EnvOptions& env_options,
                 const InternalKeyComparator& internal_comparator,
                 const FileDescriptor& fd, bool sequential_mode,
-                size_t readahead, bool record_read_stats,
-                HistogramImpl* file_read_hist,
+                bool record_read_stats, HistogramImpl* file_read_hist,
                 std::unique_ptr* table_reader,
                 const SliceTransform* prefix_extractor = nullptr,
                 bool skip_filters = false, int level = -1,

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index fe5617fb5c3..8ebcd292dba 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -760,6 +760,8 @@ struct DBOptions {
   // for this mode if using block-based table.
   //
   // Default: false
+  // This flag has no effect on the behavior of compaction and is planned to
+  // be removed in the future.
   bool new_table_reader_for_compaction_inputs = false;

   // If non-zero, we perform bigger reads when doing compaction. If you're

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 66fe34b95ea..9339c35364f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -83,12 +83,13 @@ Status ReadBlockFromFile(
     bool do_uncompress, bool maybe_compressed, BlockType block_type,
     const UncompressionDict& uncompression_dict,
     const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
-    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) {
+    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
+    bool for_compaction = false) {
   BlockContents contents;
-  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
-                             &contents, ioptions, do_uncompress,
-                             maybe_compressed, block_type, uncompression_dict,
-                             cache_options, memory_allocator);
+  BlockFetcher block_fetcher(
+      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
+      do_uncompress, maybe_compressed, block_type, uncompression_dict,
+      cache_options, memory_allocator, nullptr, for_compaction);
   Status s = block_fetcher.ReadBlockContents();
   if (s.ok()) {
     result->reset(new Block(std::move(contents), global_seqno,
@@ -1906,7 +1907,7 @@ CachableEntry BlockBasedTable::GetFilter(
   if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
     return {rep_->filter_entry.GetValue(), /*cache=*/nullptr,
-        /*cache_handle=*/nullptr, /*own_value=*/false};
+            /*cache_handle=*/nullptr, /*own_value=*/false};
   }
   PERF_TIMER_GUARD(read_filter_block_nanos);
@@ -2075,7 +2076,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
     const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
     BlockType block_type, bool key_includes_seq, bool index_key_is_full,
     GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s,
-    FilePrefetchBuffer* prefetch_buffer) const {
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
   PERF_TIMER_GUARD(new_table_block_iter_nanos);

   TBlockIter* iter = input_iter != nullptr ?
input_iter : new TBlockIter; @@ -2094,7 +2095,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, - block_type, get_context, lookup_context); + block_type, get_context, lookup_context, for_compaction); if (!s.ok()) { assert(block.IsEmpty()); @@ -2144,6 +2145,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( s = block_cache->Insert(unique_key, nullptr, block.GetValue()->ApproximateMemoryUsage(), nullptr, &cache_handle); + if (s.ok()) { assert(cache_handle != nullptr); iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, @@ -2297,7 +2299,8 @@ Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const { assert(block_entry); assert(block_entry->IsEmpty()); @@ -2340,7 +2343,7 @@ Status BlockBasedTable::RetrieveBlock( block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0, - GetMemoryAllocator(rep_->table_options)); + GetMemoryAllocator(rep_->table_options), for_compaction); } if (!s.ok()) { @@ -2714,13 +2717,18 @@ void BlockBasedTableIterator::InitDataBlock() { rep->file.get(), read_options_.readahead_size, read_options_.readahead_size)); } + } else if (!prefetch_buffer_) { + prefetch_buffer_.reset( + new FilePrefetchBuffer(rep->file.get(), compaction_readahead_size_, + compaction_readahead_size_)); } Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, - /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get()); + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), + for_compaction_); block_iter_points_to_real_block_ = true; } } @@ -2806,7 +2814,8 @@ void BlockBasedTableIterator::CheckOutOfBound() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, bool for_compaction) { + Arena* arena, bool skip_filters, bool for_compaction, + size_t compaction_readahead_size) { BlockCacheLookupContext lookup_context{ for_compaction ? 
BlockCacheLookupCaller::kCompaction : BlockCacheLookupCaller::kUserIterator}; @@ -2823,7 +2832,8 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, + compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2835,7 +2845,8 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, + compaction_readahead_size); } } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 3c92621bdcd..be758c96798 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -123,6 +123,8 @@ class BlockBasedTable : public TableReader { // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). // @param skip_filters Disables loading/accessing the filter block + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator( const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, @@ -131,7 +133,8 @@ class BlockBasedTable : public TableReader { // i.e., it will populate the block cache with blocks in the new SST // files. We treat those as a user is calling iterator for now. We should // differentiate the callers. - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -234,7 +237,7 @@ class BlockBasedTable : public TableReader { TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, bool index_key_is_full, GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, - FilePrefetchBuffer* prefetch_buffer) const; + FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; class PartitionedIndexIteratorState; @@ -283,7 +286,8 @@ class BlockBasedTable : public TableReader { const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; + BlockCacheLookupContext* lookup_context, + bool for_compaction = false) const; // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -596,6 +600,8 @@ struct BlockBasedTable::Rep { // Iterates over the contents of BlockBasedTable. 
template class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true public: BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, @@ -605,7 +611,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { const SliceTransform* prefix_extractor, BlockType block_type, bool key_includes_seq = true, bool index_key_is_full = true, - bool for_compaction = false) + bool for_compaction = false, + size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), read_options_(read_options), @@ -621,6 +628,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), for_compaction_(for_compaction), + compaction_readahead_size_(compaction_readahead_size), lookup_context_(for_compaction ? BlockCacheLookupCaller::kCompaction : BlockCacheLookupCaller::kUserIterator) {} @@ -734,6 +742,9 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool index_key_is_full_; // If this iterator is created for compaction bool for_compaction_; + // Readahead size used in compaction, its value is used only if + // for_compaction_ = true + size_t compaction_readahead_size_; BlockHandle prev_index_value_; BlockCacheLookupContext lookup_context_; diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 35beb79502b..6fdddc37e49 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -93,7 +93,8 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr && prefetch_buffer_->TryReadFromCache( handle_.offset(), - static_cast(handle_.size()) + kBlockTrailerSize, &slice_)) { + static_cast(handle_.size()) + kBlockTrailerSize, &slice_, + for_compaction_)) { block_size_ = static_cast(handle_.size()); CheckBlockChecksum(); if (!status_.ok()) { @@ -217,7 +218,7 @@ Status BlockFetcher::ReadBlockContents() { PERF_TIMER_GUARD(block_read_time); // Actual file read status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_); + &slice_, used_buf_, for_compaction_); } PERF_COUNTER_ADD(block_read_count, 1); diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 06e5d9dfa31..f67c974becb 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -44,7 +44,8 @@ class BlockFetcher { const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator = nullptr, - MemoryAllocator* memory_allocator_compressed = nullptr) + MemoryAllocator* memory_allocator_compressed = nullptr, + bool for_compaction = false) : file_(file), prefetch_buffer_(prefetch_buffer), footer_(footer), @@ -58,7 +59,9 @@ class BlockFetcher { uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), - memory_allocator_compressed_(memory_allocator_compressed) {} + memory_allocator_compressed_(memory_allocator_compressed), + for_compaction_(for_compaction) {} + Status ReadBlockContents(); CompressionType get_compression_type() const { return compression_type_; } @@ -88,6 +91,7 @@ class BlockFetcher { char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; rocksdb::CompressionType compression_type_; + bool for_compaction_ = false; // return true if found bool TryGetUncompressBlockFromPersistentCache(); diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 
905528e9bbf..821743608e4 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -377,7 +377,8 @@ Slice CuckooTableIterator::value() const { InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, - bool /*skip_filters*/, bool /*for_compaction*/) { + bool /*skip_filters*/, bool /*for_compaction*/, + size_t /*compaction_readahead_size*/) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 0080a76e158..cdb0302bd3d 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -45,11 +45,15 @@ class CuckooTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; + // Returns a new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 7bbbc7966de..4205d298b6d 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -487,12 +487,11 @@ Status ReadMetaBlock(RandomAccessFileReader* file, read_options.verify_checksums = false; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, - false /* decompress */, false /*maybe_compressed*/, - BlockType::kMetaIndex, - UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, read_options, metaindex_handle, + &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); if (!status.ok()) { return status; diff --git a/table/mock_table.cc b/table/mock_table.cc index 9b250604803..4d55bf7c9a8 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -34,7 +34,8 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, - Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/, + size_t /*compaction_readahead_size*/) { return new MockTableIterator(table_); } diff --git a/table/mock_table.h b/table/mock_table.h index 005de1c3dc2..6a5b5ab31cd 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -44,7 +44,8 @@ class MockTableReader : public TableReader { const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, diff --git a/table/plain/plain_table_reader.cc 
b/table/plain/plain_table_reader.cc index 2f8f300d871..2f036e61ae1 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -196,7 +196,8 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, - Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/, + size_t /*compaction_readahead_size*/) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 774e2eb36ef..7a468bdb8c8 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -77,11 +77,15 @@ class PlainTableReader: public TableReader { bool full_scan_mode, const bool immortal_table = false, const SliceTransform* prefix_extractor = nullptr); + // Returns new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena = nullptr, bool skip_filters = false, - bool for_compaction = false) override; + bool for_compaction = false, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; diff --git a/table/table_reader.h b/table/table_reader.h index bf3289818d6..2904526e59b 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -44,11 +44,12 @@ class TableReader { // all the states but those allocated in arena. // skip_filters: disables checking the bloom filters even if they exist. This // option is effective only for block-based table format. 
- virtual InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) = 0; + // compaction_readahead_size: its value will only be used if for_compaction = + // true + virtual InternalIterator* NewIterator( + const ReadOptions&, const SliceTransform* prefix_extractor, + Arena* arena = nullptr, bool skip_filters = false, + bool for_compaction = false, size_t compaction_readahead_size = 0) = 0; virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { diff --git a/table/table_test.cc b/table/table_test.cc index e836f89a8df..8e290368428 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -3590,7 +3590,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockContents metaindex_contents; BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, - &metaindex_contents); + &metaindex_contents); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); @@ -3608,7 +3608,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockContents properties_contents; BlockFetchHelper(properties_handle, BlockType::kProperties, - &properties_contents); + &properties_contents); Block properties_block(std::move(properties_contents), kDisableGlobalSequenceNumber); diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 2c4e0a39f67..0af4c2098f1 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -70,7 +70,7 @@ Status SequentialFileReader::Skip(uint64_t n) { } Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { + char* scratch, bool for_compaction) const { Status s; uint64_t elapsed = 0; { @@ -90,7 +90,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, buf.AllocateNewBuffer(read_size); while (buf.CurrentSize() < read_size) { size_t allowed; - if (for_compaction_ && rate_limiter_ != nullptr) { + if (for_compaction && rate_limiter_ != nullptr) { allowed = rate_limiter_->RequestToken( buf.Capacity() - buf.CurrentSize(), buf.Alignment(), Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead); @@ -134,7 +134,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, const char* res_scratch = nullptr; while (pos < n) { size_t allowed; - if (for_compaction_ && rate_limiter_ != nullptr) { + if (for_compaction && rate_limiter_ != nullptr) { if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { sw.DelayStart(); } @@ -711,7 +711,8 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { } // namespace Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, - uint64_t offset, size_t n) { + uint64_t offset, size_t n, + bool for_compaction) { size_t alignment = reader->file()->GetRequiredBufferAlignment(); size_t offset_ = static_cast(offset); uint64_t rounddown_offset = Rounddown(offset_, alignment); @@ -771,7 +772,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, Slice result; s = reader->Read(rounddown_offset + chunk_len, static_cast(roundup_len - chunk_len), &result, - buffer_.BufferStart() + chunk_len); + buffer_.BufferStart() + chunk_len, for_compaction); if (s.ok()) { buffer_offset_ = rounddown_offset; buffer_.Size(static_cast(chunk_len) + result.size()); @@ -780,7 +781,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, } bool 
FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
-                                          Slice* result) {
+                                          Slice* result, bool for_compaction) {
   if (track_min_offset_ && offset < min_offset_read_) {
     min_offset_read_ = static_cast(offset);
   }
@@ -797,7 +798,8 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
     assert(file_reader_ != nullptr);
     assert(max_readahead_size_ >= readahead_size_);
-    Status s = Prefetch(file_reader_, offset, n + readahead_size_);
+    Status s =
+        Prefetch(file_reader_, offset, n + readahead_size_, for_compaction);
     if (!s.ok()) {
       return false;
     }

diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h
index 5ec332fc7a1..01df1067ed9 100644
--- a/util/file_reader_writer.h
+++ b/util/file_reader_writer.h
@@ -158,7 +158,8 @@ class RandomAccessFileReader {
   RandomAccessFileReader(const RandomAccessFileReader&) = delete;
   RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;

-  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const;
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch,
+              bool for_compaction = false) const;

   Status Prefetch(uint64_t offset, size_t n) const {
     return file_->Prefetch(offset, n);
@@ -343,7 +344,9 @@ class FilePrefetchBuffer {
   // reader : the file reader.
   // offset : the file offset to start reading from.
   // n : the number of bytes to read.
-  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n);
+  // for_compaction : if prefetch is done for compaction read.
+  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n,
+                  bool for_compaction = false);

   // Tries returning the data for a file read from this buffer, if that data is
   // in the buffer.
   // Otherwise returns false.
   //
   // If the return value is true, "result" points to the data in the buffer.
   // offset : the file offset.
   // n : the number of bytes.
   // result : output buffer to put the data into.
-  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result);
+  // for_compaction : if cache read is done for compaction read.
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result,
+                        bool for_compaction = false);

   // The minimum `offset` ever passed to TryReadFromCache(). This will only be
   // tracked if track_min_offset = true.

From 5830c619d5732017a542bbef1be69f3e92dcd5f1 Mon Sep 17 00:00:00 2001
From: Jurriaan Mous
Date: Wed, 19 Jun 2019 14:39:19 -0700
Subject: [PATCH 167/572] Java: Make the generics of the Options interfaces
 more strict (#5461)

Summary:
Make the generics of the Options interfaces more strict so they are usable in a Kotlin Multiplatform expect/actual typealias implementation without causing a Violation of Finite Bound Restriction.

This fix would enable the creation of a generic Kotlin multiplatform library by just typealiasing the JVM implementation to the current Java implementation.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5461

Differential Revision: D15903288

Pulled By: sagar0

fbshipit-source-id: 75e83fdf5d2fcede40744a17e767563d6a4b0696
---
 .../java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java | 2 +-
 .../rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java | 2 +-
 .../src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java | 2 +-
 java/src/main/java/org/rocksdb/DBOptionsInterface.java | 2 +-
 .../java/org/rocksdb/MutableColumnFamilyOptionsInterface.java | 2 +-
 java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java | 2 +-
 java/src/main/java/org/rocksdb/Options.java | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
index ac8550f3ef7..532db473407 100644
--- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
@@ -14,7 +14,7 @@
 * Taken from include/rocksdb/advanced_options.h
 */
 public interface AdvancedColumnFamilyOptionsInterface
-    <T extends AdvancedColumnFamilyOptionsInterface> {
+    <T extends AdvancedColumnFamilyOptionsInterface<T>> {

 /**
  * The minimum number of write buffers that will be merged together

diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
index 3ec46712389..64a6f9dccc7 100644
--- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
@@ -12,7 +12,7 @@
 * and MutableCFOptions in util/cf_options.h
 */
 public interface AdvancedMutableColumnFamilyOptionsInterface
-    <T extends AdvancedMutableColumnFamilyOptionsInterface> {
+    <T extends AdvancedMutableColumnFamilyOptionsInterface<T>> {

 /**
  * The maximum number of write buffers that are built up in memory.
diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
index f88a21af2b0..3c8cd5d5182 100644
--- a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
@@ -6,7 +6,7 @@
 package org.rocksdb;

 public interface ColumnFamilyOptionsInterface
-    <T extends ColumnFamilyOptionsInterface>
+    <T extends ColumnFamilyOptionsInterface<T>>
     extends AdvancedColumnFamilyOptionsInterface<T> {

 /**

diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
index af9aa179bf4..611f4f5da71 100644
--- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
@@ -8,7 +8,7 @@
 import java.util.Collection;
 import java.util.List;

-public interface DBOptionsInterface<T extends DBOptionsInterface> {
+public interface DBOptionsInterface<T extends DBOptionsInterface<T>> {

 /**
  * Use this if your DB is very small (like under 1GB) and you don't want to

diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
index c2efcc54b6b..4f4749646f8 100644
--- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
@@ -6,7 +6,7 @@
 package org.rocksdb;

 public interface MutableColumnFamilyOptionsInterface
-    <T extends MutableColumnFamilyOptionsInterface>
+    <T extends MutableColumnFamilyOptionsInterface<T>>
     extends AdvancedMutableColumnFamilyOptionsInterface<T> {

 /**

diff --git a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
index 1715d69d093..00087a43cae 100644
--- a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
@@ -1,7 +1,7 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 package org.rocksdb;

-public interface MutableDBOptionsInterface<T extends MutableDBOptionsInterface> {
+public interface MutableDBOptionsInterface<T extends MutableDBOptionsInterface<T>> {

 /**
  * Specifies the maximum number of concurrent background jobs (both flushes

diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java
index 5831b1e298e..bb3c87aefd5 100644
--- a/java/src/main/java/org/rocksdb/Options.java
+++ b/java/src/main/java/org/rocksdb/Options.java
@@ -16,7 +16,7 @@
 * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
 *
 * If {@link #dispose()} function is not called, then it will be GC'd
- * automaticallyand native resources will be released as part of the process.
+ * automatically and native resources will be released as part of the process.
 */
 public class Options extends RocksObject
     implements DBOptionsInterface<Options>,

From 68614a9608f5d70a247cdcc4621a150141cfe72f Mon Sep 17 00:00:00 2001
From: Sagar Vemuri
Date: Wed, 19 Jun 2019 16:42:59 -0700
Subject: [PATCH 168/572] Fix AlignedBuffer's usage in Encryption Env (#5396)

Summary:
The usage of `AlignedBuffer` in env_encryption.cc writes and reads to/from the AlignedBuffer's internal buffer directly without going through AlignedBuffer's APIs (like `Append` and `Read`), causing encapsulation to break in some cases. The writes are especially problematic, as after the data is written to the buffer (directly using either memmove or memcpy), the size of the buffer is not updated ... causing the AlignedBuffer to lose track of the encapsulated buffer's current size.
Fixed this by updating the buffer size after every write.
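A minimal sketch of the invariant this fix restores, using a toy buffer rather than the real `rocksdb::AlignedBuffer`: every raw write into `BufferStart()` must be followed by an explicit `Size()` update so that `CurrentSize()` reflects the bytes actually present.

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Toy stand-in for AlignedBuffer (illustrative only).
struct ToyAlignedBuffer {
  std::vector<char> data_;
  size_t size_ = 0;
  void AllocateNewBuffer(size_t n) { data_.resize(n); size_ = 0; }
  char* BufferStart() { return data_.data(); }
  void Size(size_t n) { size_ = n; }           // record the logical size
  size_t CurrentSize() const { return size_; } // bytes actually present
};

int main() {
  const char payload[] = "secret";
  ToyAlignedBuffer buf;
  buf.AllocateNewBuffer(sizeof(payload));
  // The buggy pattern wrote via memmove and kept using data.size(), while
  // CurrentSize() silently stayed 0; the fix records the size explicitly
  // and then uses CurrentSize() for the encryption and Slice length.
  std::memmove(buf.BufferStart(), payload, sizeof(payload));
  buf.Size(sizeof(payload));
  assert(buf.CurrentSize() == sizeof(payload));
  return 0;
}
```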
Todo for later: Add an overloaded method to AlignedBuffer to support a memmove in addition to a memcopy. Encryption env does a memmove, and hence I couldn't switch to using `AlignedBuffer.Append()`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5396 Test Plan: `make check` Differential Revision: D15764756 Pulled By: sagar0 fbshipit-source-id: 2e24b52bd3b4b5056c5c1da157f91ddf89370183 --- env/env_encryption.cc | 87 ++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/env/env_encryption.cc b/env/env_encryption.cc index df1b0011a01..6be2137ed6e 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -195,23 +195,26 @@ class EncryptedWritableFile : public WritableFileWrapper { EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - Status Append(const Slice& data) override { + Status Append(const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); if (data.size() > 0) { auto offset = file_->GetFileSize(); // size including prefix // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); + status = file_->Append(dataToAppend); if (!status.ok()) { return status; } @@ -221,18 +224,19 @@ class EncryptedWritableFile : public WritableFileWrapper { Status PositionedAppend(const Slice& data, uint64_t offset) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } status = file_->PositionedAppend(dataToAppend, offset); if (!status.ok()) { @@ -325,18 +329,19 @@ class EncryptedRandomRWFile : public RandomRWFile { Status Write(uint64_t offset, const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToWrite(data); + Slice dataToWrite(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToWrite = Slice(buf.BufferStart(), data.size()); + dataToWrite = Slice(buf.BufferStart(), 
buf.CurrentSize()); } status = file_->Write(offset, dataToWrite); return status; @@ -393,13 +398,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -430,13 +436,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -467,12 +474,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -513,12 +521,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -554,12 +563,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -609,11 +619,13 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } else { - // File is new, initialize & write prefix + // File is new, initialize & write prefix provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), 
prefixBuf.CurrentSize()); + // Write prefix status = underlying->Write(0, prefixSlice); if (!status.ok()) { return status; @@ -630,7 +642,7 @@ class EncryptedEnv : public EnvWrapper { return Status::OK(); } - // Store in *result the attributes of the children of the specified directory. + // Store in *result the attributes of the children of the specified directory. // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. @@ -670,8 +682,7 @@ class EncryptedEnv : public EnvWrapper { EncryptionProvider *provider_; }; - -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { return new EncryptedEnv(base_env, provider); @@ -694,14 +705,14 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not encrypting a full block. + // We're not encrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy plain data to block buffer + // Copy plain data to block buffer memmove(block + blockOffset, data, n); } auto status = EncryptBlock(blockIndex, block, (char*)scratch.data()); @@ -741,14 +752,14 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not decrypting a full block. + // We're not decrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy encrypted data to block buffer + // Copy encrypted data to block buffer memmove(block + blockOffset, data, n); } auto status = DecryptBlock(blockIndex, block, (char*)scratch.data()); @@ -807,7 +818,7 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); - // Encrypt nonce+counter + // Encrypt nonce+counter auto status = cipher_.Encrypt(scratch); if (!status.ok()) { return status; @@ -823,13 +834,13 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { - // For CTR decryption & encryption are the same + // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. -// For optimal performance, the prefix length should be a multiple of +// For optimal performance, the prefix length should be a multiple of // the page size. 
size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; @@ -844,7 +855,7 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & iv = Slice(prefix + blockSize, blockSize); } -// CreateNewPrefix initialized an allocated block of prefix memory +// CreateNewPrefix initialized an allocated block of prefix memory // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, @@ -873,7 +884,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, return Status::OK(); } -// PopulateSecretPrefixPart initializes the data into a new prefix block +// PopulateSecretPrefixPart initializes the data into a new prefix block // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. @@ -908,7 +919,7 @@ Status CTREncryptionProvider::CreateCipherStream( return status; } - // Create cipher stream + // Create cipher stream return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); } From 24f73436fbdfb2728250ebeb076d4a953af58ddc Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 20 Jun 2019 11:41:59 -0700 Subject: [PATCH 169/572] sanitize and limit block_size under 4GB (#5492) Summary: `Block::restart_index_`, `Block::restarts_`, and `Block::current_` are defined as uint32_t but `BlockBasedTableOptions::block_size` is defined as a size_t so user might see corruption as in https://github.com/facebook/rocksdb/issues/5486. This PR adds a check in `BlockBasedTableFactory::SanitizeOptions` to disallow such configurations. yiwu-arbug Pull Request resolved: https://github.com/facebook/rocksdb/pull/5492 Differential Revision: D15914047 Pulled By: miasantreble fbshipit-source-id: c943f153d967e15aee7f2795730ab8259e2be201 --- db/db_test.cc | 11 +++++++++++ table/block_based/block_based_table_factory.cc | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index 0204f4d9f62..69e91923cd6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6156,6 +6156,17 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) { fprintf(stderr, "Done. 
Flushed %d times, destroyed %d threads\n",
           flushes_done.load(), threads_destroyed.load());
 }
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(0, "foo", "bar"));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 8LL*1024*1024*1024LL;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
 } // namespace rocksdb

 int main(int argc, char** argv) {

diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 00b13033f3d..96812e233b8 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -257,6 +257,10 @@ Status BlockBasedTableFactory::SanitizeOptions(
     return Status::InvalidArgument(
         "Block alignment requested but block size is not a power of 2");
   }
+  if (table_options_.block_size > port::kMaxUint32) {
+    return Status::InvalidArgument(
+        "block size exceeds maximum number (4GiB) allowed");
+  }
   if (table_options_.data_block_index_type ==
           BlockBasedTableOptions::kDataBlockBinaryAndHash &&
       table_options_.data_block_hash_table_util_ratio <= 0) {

From 0b0cb6f1a2f71eb4532416a959ebcf682ac9096b Mon Sep 17 00:00:00 2001
From: feilongliu
Date: Thu, 20 Jun 2019 13:04:13 -0700
Subject: [PATCH 170/572] Fix segfault in ~DBWithTTLImpl() when called after
 Close() (#5485)

Summary:
~DBWithTTLImpl() fails after Close() has been called (Close() invokes the Close() function of DBImpl), because Close() deletes default_cf_handle_, which is used by the GetOptions() call in ~DBWithTTLImpl(), hence leading to a segfault.

Fix by creating a Close() function for the DBWithTTLImpl class that does the close and the cleanup work originally done in ~DBWithTTLImpl(). If the Close() function is not called, it will be called in the ~DBWithTTLImpl() function.
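A usage sketch of the fixed lifecycle (the database path and TTL value below are illustrative): with this patch, an explicit `Close()` followed by `delete` is safe, because the destructor now checks whether `Close()` already ran.

```cpp
#include <cassert>

#include "rocksdb/utilities/db_ttl.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DBWithTTL* db = nullptr;
  rocksdb::Status s =
      rocksdb::DBWithTTL::Open(options, "/tmp/ttl_example", &db, /*ttl=*/1);
  assert(s.ok());
  s = db->Close();  // explicit close; cleans up the compaction filter once
  assert(s.ok());
  delete db;        // destructor sees closed_ == true and does not re-close
  return 0;
}
```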
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5485 Test Plan: make clean; USE_CLANG=1 make all check -j Differential Revision: D15924498 fbshipit-source-id: 567397fb972961059083a1ae0f9f99ff74872b78 --- utilities/ttl/db_ttl_impl.cc | 21 ++++++++++++++---- utilities/ttl/db_ttl_impl.h | 6 +++++ utilities/ttl/ttl_test.cc | 43 ++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 47049a13585..2c79d01ba12 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -34,12 +34,25 @@ void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options, } // Open the db inside DBWithTTLImpl because options needs pointer to its ttl -DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db) {} +DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {} DBWithTTLImpl::~DBWithTTLImpl() { - // Need to stop background compaction before getting rid of the filter - CancelAllBackgroundWork(db_, /* wait = */ true); - delete GetOptions().compaction_filter; + if (!closed_) { + Close(); + } +} + +Status DBWithTTLImpl::Close() { + Status ret = Status::OK(); + if (!closed_) { + Options default_options = GetOptions(); + // Need to stop background compaction before getting rid of the filter + CancelAllBackgroundWork(db_, /* wait = */ true); + ret = db_->Close(); + delete default_options.compaction_filter; + closed_ = true; + } + return ret; } Status UtilityDB::OpenTtlDB(const Options& options, const std::string& dbname, diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 593cd64a0fc..1111c13a79f 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -35,6 +35,8 @@ class DBWithTTLImpl : public DBWithTTL { virtual ~DBWithTTLImpl(); + virtual Status Close() override; + Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options, const std::string& column_family_name, ColumnFamilyHandle** handle, @@ -99,6 +101,10 @@ class DBWithTTLImpl : public DBWithTTL { void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); } void SetTtl(ColumnFamilyHandle *h, int32_t ttl) override; + + private: + // remember whether the Close completes or not + bool closed_; }; class TtlIterator : public Iterator { diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 38c6affab8f..61f5e64497d 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -86,9 +86,24 @@ class TtlTest : public testing::Test { ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true)); } + // Call db_ttl_->Close() before delete db_ttl_ void CloseTtl() { - delete db_ttl_; - db_ttl_ = nullptr; + CloseTtlHelper(true); + } + + // No db_ttl_->Close() before delete db_ttl_ + void CloseTtlNoDBClose() { + CloseTtlHelper(false); + } + + void CloseTtlHelper(bool close_db) { + if (db_ttl_ != nullptr) { + if (close_db) { + db_ttl_->Close(); + } + delete db_ttl_; + db_ttl_ = nullptr; + } } // Populates and returns a kv-map @@ -401,6 +416,30 @@ TEST_F(TtlTest, NoEffect) { CloseTtl(); } + +// Rerun the NoEffect test with a different version of CloseTtl +// function, where db is directly deleted without close. 
+TEST_F(TtlTest, DestructWithoutClose) {
+  MakeKVMap(kSampleSize_);
+  int64_t boundary1 = kSampleSize_ / 3;
+  int64_t boundary2 = 2 * boundary1;
+
+  OpenTtl();
+  PutValues(0, boundary1);                       //T=0: Set1 never deleted
+  SleepCompactCheck(1, 0, boundary1);            //T=1: Set1 still there
+  CloseTtlNoDBClose();
+
+  OpenTtl(0);
+  PutValues(boundary1, boundary2 - boundary1);   //T=1: Set2 never deleted
+  SleepCompactCheck(1, 0, boundary2);            //T=2: Sets1 & 2 still there
+  CloseTtlNoDBClose();
+
+  OpenTtl(-1);
+  PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted
+  SleepCompactCheck(1, 0, kSampleSize_, true);    //T=4: Sets 1,2,3 still there
+  CloseTtlNoDBClose();
+}
+
 // Puts a set of values and checks its presence using Get during ttl
 TEST_F(TtlTest, PresentDuringTTL) {
   MakeKVMap(kSampleSize_);

From 705b8eecb49272fb100b0ae4b735829e9adf5ca9 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 20 Jun 2019 14:28:22 -0700
Subject: [PATCH 171/572] Add more callers for table reader. (#5454)

Summary:
This PR adds more callers for table readers. This information is only used for block cache analysis, so that we can know which caller accesses a block.
1. It renames the BlockCacheLookupCaller to TableReaderCaller, as passing the caller from upstream requires changes to table_reader.h and TableReaderCaller is a more appropriate name.
2. It adds more table reader callers in table/table_reader_caller.h, e.g., kCompactionRefill, kExternalSSTIngestion, and kBuildTable.

This PR is long as it requires modification of interfaces in table_reader.h, e.g., NewIterator.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5454

Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32.

Differential Revision: D15819451

Pulled By: HaoyuHuang

fbshipit-source-id: b6caa704c8fb96ddd15b9a934b7e7ea87f88092d
---
 db/builder.cc | 5 +-
 db/compaction/compaction_job.cc | 12 ++-
 db/convenience.cc | 2 +-
 db/db_impl/db_impl.cc | 2 +-
 db/external_sst_file_ingestion_job.cc | 6 +-
 db/forward_iterator.cc | 29 ++++-
 db/repair.cc | 6 +-
 db/table_cache.cc | 6 +-
 db/table_cache.h | 10 +-
 db/version_set.cc | 100 ++++++++++--------
 db/version_set.h | 7 +-
 table/block_based/block_based_table_reader.cc | 48 ++++-----
 table/block_based/block_based_table_reader.h | 44 +++-----
 table/block_based/partitioned_filter_block.cc | 2 +-
 table/cuckoo/cuckoo_table_reader.cc | 2 +-
 table/cuckoo/cuckoo_table_reader.h | 8 +-
 table/cuckoo/cuckoo_table_reader_test.cc | 9 +-
 table/mock_table.cc | 3 +-
 table/mock_table.h | 9 +-
 table/plain/plain_table_reader.cc | 4 +-
 table/plain/plain_table_reader.h | 8 +-
 table/sst_file_reader.cc | 7 +-
 table/table_reader.h | 16 +--
 table/table_reader_bench.cc | 4 +-
 table/table_reader_caller.h | 39 +++++++
 table/table_test.cc | 73 ++++++++-----
 tools/block_cache_trace_analyzer.cc | 38 +++++--
 tools/block_cache_trace_analyzer.h | 2 +-
 tools/block_cache_trace_analyzer_test.cc | 20 ++--
 tools/sst_dump_tool.cc | 9 +-
 trace_replay/block_cache_tracer.cc | 10 +-
 trace_replay/block_cache_tracer.h | 24 ++---
 trace_replay/block_cache_tracer_test.cc | 16 +--
 33 files changed, 337 insertions(+), 243 deletions(-)
 create mode 100644 table/table_reader_caller.h

diff --git a/db/builder.cc b/db/builder.cc
index 67d764ad18b..eac1b5fe2e1 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -221,8 +221,9 @@ Status BuildTable(
         mutable_cf_options.prefix_extractor.get(), nullptr,
         (internal_stats == nullptr) ?
nullptr : internal_stats->GetFileReadHist(0), - false /* for_compaction */, nullptr /* arena */, - false /* skip_filter */, level)); + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr)); s = it->status(); if (s.ok() && paranoid_file_checks) { for (it->SeekToFirst(); it->Valid(); it->Next()) { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 65efedad5b4..db701d19dad 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -521,7 +521,7 @@ void CompactionJob::GenSubcompactionBoundaries() { // mutex to reduce contention db_mutex_->Unlock(); uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, - /*for_compaction*/ true); + TableReaderCaller::kCompaction); db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; @@ -646,12 +646,14 @@ Status CompactionJob::Run() { // to cache it here for further user reads InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), env_options_, cfd->internal_comparator(), - *files_meta[file_idx], nullptr /* range_del_agg */, - prefix_extractor, nullptr, + *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), - false, nullptr /* arena */, false /* skip_filters */, - compact_->compaction->output_level()); + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { diff --git a/db/convenience.cc b/db/convenience.cc index c11653fb190..271217cd4f8 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -59,7 +59,7 @@ Status VerifySstFileChecksum(const Options& options, if (!s.ok()) { return s; } - s = table_reader->VerifyChecksum(); + s = table_reader->VerifyChecksum(TableReaderCaller::kUserVerifyChecksum); return s; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 6341b76854c..f3fc96d8d1f 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2771,7 +2771,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) { sizes[i] += versions_->ApproximateSize( v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, - /*for_compaction=*/false); + TableReaderCaller::kUserApproximateSize); } if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 0068685b0ba..7e9657cc901 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -308,7 +308,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } if (ingestion_options_.verify_checksums_before_ingest) { - status = table_reader->VerifyChecksum(); + status = + table_reader->VerifyChecksum(TableReaderCaller::kExternalSSTIngestion); } if (!status.ok()) { return status; @@ -368,7 +369,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // updating the block cache. 
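    // (The iterator created below is now tagged with
    // TableReaderCaller::kExternalSSTIngestion, so these reads are attributed
    // to external SST ingestion in block cache traces.)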
ro.fill_cache = false; std::unique_ptr iter(table_reader->NewIterator( - ro, sv->mutable_cf_options.prefix_extractor.get())); + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); std::unique_ptr range_del_iter( table_reader->NewRangeTombstoneIterator(ro)); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 2633a3ff9bd..c875008c769 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -79,7 +79,11 @@ class ForwardLevelIterator : public InternalIterator { read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false); + prefix_extractor_, /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -642,7 +646,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor.get())); + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } BuildLevelIterators(vstorage); current_ = nullptr; @@ -714,7 +723,12 @@ void ForwardIterator::RenewIterators() { read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor.get())); + svnew->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } for (auto* f : l0_iters_) { @@ -772,8 +786,13 @@ void ForwardIterator::ResetIncompleteIterators() { DeleteIterator(l0_iters_[i]); l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - *l0_files[i], nullptr /* range_del_agg */, - sv_->mutable_cf_options.prefix_extractor.get()); + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/repair.cc b/db/repair.cc index 3ae46c6e7ee..8967b39f30b 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -520,7 +520,11 @@ class Repairer { InternalIterator* iter = table_cache_->NewIterator( ropts, env_options_, cfd->internal_comparator(), t->meta, nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, + /*level=*/-1, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; diff --git a/db/table_cache.cc b/db/table_cache.cc index bbfaf32e09e..b98d4b074ff 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -176,7 +176,7 @@ InternalIterator* TableCache::NewIterator( const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters, int level, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); @@ -187,7 +187,7 @@ InternalIterator* TableCache::NewIterator( if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - + bool for_compaction = caller == TableReaderCaller::kCompaction; auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { @@ -206,7 +206,7 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, for_compaction, + skip_filters, caller, env_options.compaction_readahead_size); } if (handle != nullptr) { diff --git a/db/table_cache.h b/db/table_cache.h index dbf76039a23..f274337e952 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -70,12 +70,10 @@ class TableCache { const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const SliceTransform* prefix_extractor = nullptr, - TableReader** table_reader_ptr = nullptr, - HistogramImpl* file_read_hist = nullptr, bool 
for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1, - const InternalKey* smallest_compaction_key = nullptr, - const InternalKey* largest_compaction_key = nullptr); + const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, + HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, + bool skip_filters, int level, const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until diff --git a/db/version_set.cc b/db/version_set.cc index 9978c8cd463..8e2d21b051a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -850,14 +850,15 @@ namespace { class LevelIterator final : public InternalIterator { public: - LevelIterator( - TableCache* table_cache, const ReadOptions& read_options, - const EnvOptions& env_options, const InternalKeyComparator& icomparator, - const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, - bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregator* range_del_agg, - const std::vector* compaction_boundaries = - nullptr) + LevelIterator(TableCache* table_cache, const ReadOptions& read_options, + const EnvOptions& env_options, + const InternalKeyComparator& icomparator, + const LevelFilesBrief* flevel, + const SliceTransform* prefix_extractor, bool should_sample, + HistogramImpl* file_read_hist, TableReaderCaller caller, + bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::vector* + compaction_boundaries = nullptr) : InternalIterator(false), table_cache_(table_cache), read_options_(read_options), @@ -868,7 +869,7 @@ class LevelIterator final : public InternalIterator { prefix_extractor_(prefix_extractor), file_read_hist_(file_read_hist), should_sample_(should_sample), - for_compaction_(for_compaction), + caller_(caller), skip_filters_(skip_filters), file_index_(flevel_->num_files), level_(level), @@ -957,9 +958,9 @@ class LevelIterator final : public InternalIterator { return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, caller_, + /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key, + largest_compaction_key); } TableCache* table_cache_; @@ -973,7 +974,7 @@ class LevelIterator final : public InternalIterator { HistogramImpl* file_read_hist_; bool should_sample_; - bool for_compaction_; + TableReaderCaller caller_; bool skip_filters_; size_t file_index_; int level_; @@ -1442,10 +1443,14 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, - range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, arena, - false /* skip_filters */, 0 /* level */)); + read_options, soptions, cfd_->internal_comparator(), + *file.file_metadata, range_del_agg, + 
mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } if (should_sample) { // Count ones for every L0 files. This is done per iterator creation @@ -1466,8 +1471,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, - range_del_agg)); + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, + range_del_agg, /*largest_compaction_key=*/nullptr)); } } @@ -1496,10 +1501,14 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, continue; } ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( - read_options, env_options, cfd_->internal_comparator(), *file->file_metadata, - &range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, &arena, - false /* skip_filters */, 0 /* level */)); + read_options, env_options, cfd_->internal_comparator(), + *file->file_metadata, &range_del_agg, + mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, &arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -1513,7 +1522,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, &range_del_agg)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); @@ -4823,7 +4832,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // maintain state of where they first appear in the files. 
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, const Slice& end, int start_level, - int end_level, bool for_compaction) { + int end_level, TableReaderCaller caller) { // pre-condition assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); @@ -4844,7 +4853,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (!level) { // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end, for_compaction); + size += ApproximateSizeLevel0(v, files_brief, start, end, caller); continue; } @@ -4861,7 +4870,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, // inferred from the sorted order for (uint64_t i = idx_start; i < files_brief.num_files; i++) { uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end, for_compaction); + val = ApproximateSize(v, files_brief.files[i], end, caller); if (!val) { // the files after this will not have the range break; @@ -4872,7 +4881,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, if (i == idx_start) { // subtract the bytes needed to be scanned to get to the starting // key - val = ApproximateSize(v, files_brief.files[i], start, for_compaction); + val = ApproximateSize(v, files_brief.files[i], start, caller); assert(size >= val); size -= val; } @@ -4886,15 +4895,15 @@ uint64_t VersionSet::ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, const Slice& key_start, const Slice& key_end, - bool for_compaction) { + TableReaderCaller caller) { // level 0 files are not in sorted order, we need to iterate through // the list to compute the total bytes that require scanning uint64_t size = 0; for (size_t i = 0; i < files_brief.num_files; i++) { const uint64_t start = - ApproximateSize(v, files_brief.files[i], key_start, for_compaction); + ApproximateSize(v, files_brief.files[i], key_start, caller); const uint64_t end = - ApproximateSize(v, files_brief.files[i], key_end, for_compaction); + ApproximateSize(v, files_brief.files[i], key_end, caller); assert(end >= start); size += end - start; } @@ -4902,7 +4911,8 @@ uint64_t VersionSet::ApproximateSizeLevel0(Version* v, } uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key, bool for_compaction) { + const Slice& key, + TableReaderCaller caller) { // pre-condition assert(v); @@ -4920,9 +4930,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, InternalIterator* iter = v->cfd_->table_cache()->NewIterator( ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), *f.file_metadata, nullptr /* range_del_agg */, - v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); + v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr, + /*file_read_hist=*/nullptr, caller, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key, for_compaction); + result = table_reader_ptr->ApproximateOffsetOf(key, caller); } delete iter; } @@ -5001,10 +5015,12 @@ InternalIterator* VersionSet::MakeInputIterator( read_options, env_options_compactions, cfd->internal_comparator(), *flevel->files[i].file_metadata, range_del_agg, c->mutable_cf_options()->prefix_extractor.get(), - nullptr /* table_reader_ptr */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, nullptr /* arena */, - false 
/* skip_filters */, static_cast(which) /* level */); + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/static_cast(which), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); } } else { // Create concatenating iterator for the files from this level @@ -5012,10 +5028,10 @@ InternalIterator* VersionSet::MakeInputIterator( cfd->table_cache(), read_options, env_options_compactions, cfd->internal_comparator(), c->input_levels(which), c->mutable_cf_options()->prefix_extractor.get(), - false /* should_sample */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, false /* skip_filters */, - static_cast(which) /* level */, range_del_agg, + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(which), range_del_agg, c->boundaries(which)); } } diff --git a/db/version_set.h b/db/version_set.h index ba1b4d3e3d0..6b7c42881c1 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -984,7 +984,8 @@ class VersionSet { // in levels [start_level, end_level). If end_level == 0 it will search // through all non-empty levels uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, - int start_level, int end_level, bool for_compaction); + int start_level, int end_level, + TableReaderCaller caller); // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } @@ -1035,10 +1036,10 @@ class VersionSet { // ApproximateSize helper uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, const Slice& start, const Slice& end, - bool for_compaction); + TableReaderCaller caller); uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key, bool for_compaction); + const Slice& key, TableReaderCaller caller); // Save current contents to *log Status WriteSnapshot(log::Writer* log); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 9339c35364f..5b2f515006f 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -349,7 +349,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { nullptr, kNullStats, true, index_key_includes_seq(), index_value_is_full()), false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - index_key_includes_seq(), index_value_is_full()); + index_key_includes_seq(), index_value_is_full(), + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); } assert(it != nullptr); @@ -365,7 +367,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; auto rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; @@ -1075,7 +1077,7 @@ Status BlockBasedTable::Open( // Better not mutate rep_ after the creation. eg. internal_prefix_transform // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. 
- BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, internal_comparator, skip_filters, level, immortal_table); @@ -2681,7 +2683,7 @@ void BlockBasedTableIterator::InitDataBlock() { // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. // Explicit user requested readahead: // Enabled from the very first IO when ReadOptions.readahead_size is set. - if (!for_compaction_) { + if (lookup_context_.caller != TableReaderCaller::kCompaction) { if (read_options_.readahead_size == 0) { // Implicit auto readahead num_file_reads_++; @@ -2728,7 +2730,8 @@ void BlockBasedTableIterator::InitDataBlock() { read_options_, data_block_handle, &block_iter_, block_type_, key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), - for_compaction_); + /*for_compaction=*/lookup_context_.caller == + TableReaderCaller::kCompaction); block_iter_points_to_real_block_ = true; } } @@ -2814,11 +2817,8 @@ void BlockBasedTableIterator::CheckOutOfBound() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, bool for_compaction, - size_t compaction_readahead_size) { - BlockCacheLookupContext lookup_context{ - for_compaction ? BlockCacheLookupCaller::kCompaction - : BlockCacheLookupCaller::kUserIterator}; + Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) { + BlockCacheLookupContext lookup_context{caller}; bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); if (arena == nullptr) { @@ -2832,7 +2832,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, + /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); } else { auto* mem = @@ -2845,8 +2845,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction, - compaction_readahead_size); + /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); } } @@ -2933,7 +2932,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserGet}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet}; { if (!skip_filters) { filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, @@ -2989,7 +2988,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, break; } else { BlockCacheLookupContext lookup_data_block_context{ - BlockCacheLookupCaller::kUserGet}; + TableReaderCaller::kUserGet}; bool does_referenced_key_exist = false; DataBlockIter biter; uint64_t referenced_data_size = 0; @@ -3084,7 +3083,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - 
BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kUserMGet}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet}; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; FilterBlockReader* filter = nullptr; @@ -3135,7 +3134,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( - BlockCacheLookupCaller::kUserMGet); + TableReaderCaller::kUserMultiGet); if (iiter->value().offset() != offset) { offset = iiter->value().offset(); biter.Invalidate(Status::OK()); @@ -3244,7 +3243,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); } - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter iiter_on_stack; auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, @@ -3299,9 +3298,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, return Status::OK(); } -Status BlockBasedTable::VerifyChecksum() { - // TODO(haoyu): This function is called by external sst ingestion and the - // verify checksum public API. We don't log its block cache accesses for now. +Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { Status s; // Check Meta blocks std::unique_ptr meta; @@ -3317,9 +3314,10 @@ Status BlockBasedTable::VerifyChecksum() { } // Check Data blocks IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; InternalIteratorBase* iiter = NewIndexIterator( ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, - /*get_context=*/nullptr, /*lookup_contex=*/nullptr); + /*get_context=*/nullptr, &context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr = @@ -3536,10 +3534,8 @@ Status BlockBasedTable::CreateIndexReader( } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, - bool for_compaction) { - BlockCacheLookupContext context( - for_compaction ? BlockCacheLookupCaller::kCompaction - : BlockCacheLookupCaller::kUserApproximateSize); + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index be758c96798..b03e67128e2 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -123,18 +123,13 @@ class BlockBasedTable : public TableReader { // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). // @param skip_filters Disables loading/accessing the filter block - // compaction_readahead_size: its value will only be used if for_compaction = - // true - InternalIterator* NewIterator( - const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, bool skip_filters = false, - // TODO(haoyu) 1. External SST ingestion sets for_compaction as false. 2. - // Compaction also sets it to false when paranoid_file_checks is true, - // i.e., it will populate the block cache with blocks in the new SST - // files. 
We treat those as a user is calling iterator for now. We should - // differentiate the callers. - bool for_compaction = false, - size_t compaction_readahead_size = 0) override; + // compaction_readahead_size: its value will only be used if caller = + // kCompaction. + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -160,7 +155,8 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key, bool for_compaction) override; + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; bool TEST_BlockInCache(const BlockHandle& handle) const; @@ -180,7 +176,7 @@ class BlockBasedTable : public TableReader { Status DumpTable(WritableFile* out_file, const SliceTransform* prefix_extractor = nullptr) override; - Status VerifyChecksum() override; + Status VerifyChecksum(TableReaderCaller caller) override; void Close() override; @@ -609,9 +605,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, - BlockType block_type, bool key_includes_seq = true, - bool index_key_is_full = true, - bool for_compaction = false, + BlockType block_type, bool key_includes_seq, + bool index_key_is_full, TableReaderCaller caller, size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), @@ -627,11 +622,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { block_type_(block_type), key_includes_seq_(key_includes_seq), index_key_is_full_(index_key_is_full), - for_compaction_(for_compaction), - compaction_readahead_size_(compaction_readahead_size), - lookup_context_(for_compaction - ? BlockCacheLookupCaller::kCompaction - : BlockCacheLookupCaller::kUserIterator) {} + lookup_context_(caller), + compaction_readahead_size_(compaction_readahead_size) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -740,13 +732,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; bool index_key_is_full_; - // If this iterator is created for compaction - bool for_compaction_; - // Readahead size used in compaction, its value is used only if - // for_compaction_ = true - size_t compaction_readahead_size_; BlockHandle prev_index_value_; BlockCacheLookupContext lookup_context_; + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. 
+ size_t compaction_readahead_size_; // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index e80085dfb5b..cce6744157e 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -277,7 +277,7 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { void PartitionedFilterBlockReader::CacheDependencies( bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs - BlockCacheLookupContext lookup_context{BlockCacheLookupCaller::kPrefetch}; + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 821743608e4..30109ece6ce 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -377,7 +377,7 @@ Slice CuckooTableIterator::value() const { InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, - bool /*skip_filters*/, bool /*for_compaction*/, + bool /*skip_filters*/, TableReaderCaller /*caller*/, size_t /*compaction_readahead_size*/) { if (!status().ok()) { return NewErrorInternalIterator( diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index cdb0302bd3d..10db084259f 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -50,10 +50,8 @@ class CuckooTableReader: public TableReader { // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false, - size_t compaction_readahead_size = 0) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. 
@@ -61,7 +59,7 @@ class CuckooTableReader: public TableReader { // Following methods are not implemented for Cuckoo Table Reader uint64_t ApproximateOffsetOf(const Slice& /*key*/, - bool /*for_compaction*/ = false) override { + TableReaderCaller /*caller*/) override { return 0; } void SetupForCompaction() override {} diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index dd65ffe8490..dd1557db147 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -146,8 +146,9 @@ class CuckooReaderTest : public testing::Test { CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - InternalIterator* it = - reader.NewIterator(ReadOptions(), nullptr, nullptr, false); + InternalIterator* it = reader.NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->SeekToFirst(); @@ -186,7 +187,9 @@ class CuckooReaderTest : public testing::Test { delete it; Arena arena; - it = reader.NewIterator(ReadOptions(), nullptr, &arena); + it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena, + /*skip_filters=*/false, + TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->Seek(keys[num_items/2]); diff --git a/table/mock_table.cc b/table/mock_table.cc index 4d55bf7c9a8..022f9a63f52 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -34,8 +34,7 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, - Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/, - size_t /*compaction_readahead_size*/) { + Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, size_t /*compaction_readahead_size*/) { return new MockTableIterator(table_); } diff --git a/table/mock_table.h b/table/mock_table.h index 6a5b5ab31cd..4b886e63e25 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -42,17 +42,16 @@ class MockTableReader : public TableReader { InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false, - size_t compaction_readahead_size = 0) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; uint64_t ApproximateOffsetOf(const Slice& /*key*/, - bool /*for_compaction*/ = false) override { + TableReaderCaller /*caller*/) override { return 0; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 2f036e61ae1..f6c348fdbf9 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -196,7 +196,7 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, - Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/, + Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, size_t /*compaction_readahead_size*/) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { @@ -616,7 +616,7 
+616,7 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
 }

 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
-                                               bool /*for_compaction*/) {
+                                               TableReaderCaller /*caller*/) {
   return 0;
 }

diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h
index 7a468bdb8c8..f63649cacf8 100644
--- a/table/plain/plain_table_reader.h
+++ b/table/plain/plain_table_reader.h
@@ -82,10 +82,8 @@ class PlainTableReader: public TableReader {
   // true
   InternalIterator* NewIterator(const ReadOptions&,
                                 const SliceTransform* prefix_extractor,
-                                Arena* arena = nullptr,
-                                bool skip_filters = false,
-                                bool for_compaction = false,
-                                size_t compaction_readahead_size = 0) override;
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller, size_t compaction_readahead_size = 0) override;

   void Prepare(const Slice& target) override;

@@ -94,7 +92,7 @@ class PlainTableReader: public TableReader {
                 bool skip_filters = false) override;

   uint64_t ApproximateOffsetOf(const Slice& key,
-                               bool for_compaction = false) override;
+                               TableReaderCaller caller) override;

   uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
   void SetupForCompaction() override;
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index 54408bb50e9..7c3b91cc39a 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -65,8 +65,9 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& options) {
   auto sequence = options.snapshot != nullptr
                       ? options.snapshot->GetSequenceNumber()
                       : kMaxSequenceNumber;
-  auto internal_iter =
-      r->table_reader->NewIterator(options, r->moptions.prefix_extractor.get());
+  auto internal_iter = r->table_reader->NewIterator(
+      options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTFileReader);
   return NewDBIterator(r->options.env, options, r->ioptions, r->moptions,
                        r->ioptions.user_comparator, internal_iter, sequence,
                        r->moptions.max_sequential_skip_in_iterations,
@@ -79,7 +80,7 @@ std::shared_ptr SstFileReader::GetTableProperties()
 }

 Status SstFileReader::VerifyChecksum() {
-  return rep_->table_reader->VerifyChecksum();
+  return rep_->table_reader->VerifyChecksum(TableReaderCaller::kSSTFileReader);
 }

 }  // namespace rocksdb
diff --git a/table/table_reader.h b/table/table_reader.h
index 2904526e59b..1c879cb1f81 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -14,6 +14,7 @@
 #include "table/get_context.h"
 #include "table/internal_iterator.h"
 #include "table/multiget_context.h"
+#include "table/table_reader_caller.h"

 namespace rocksdb {

@@ -44,12 +45,11 @@ class TableReader {
   // all the states but those allocated in arena.
   // skip_filters: disables checking the bloom filters even if they exist. This
   // option is effective only for block-based table format.
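+  // caller: the call site requesting this iterator; only used for block
+  // cache tracing and analysis.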
-  // compaction_readahead_size: its value will only be used if for_compaction =
-  // true
-  virtual InternalIterator* NewIterator(
-      const ReadOptions&, const SliceTransform* prefix_extractor,
-      Arena* arena = nullptr, bool skip_filters = false,
-      bool for_compaction = false, size_t compaction_readahead_size = 0) = 0;
+  // compaction_readahead_size: its value will only be used if caller = kCompaction
+  virtual InternalIterator* NewIterator(const ReadOptions&,
+                                        const SliceTransform* prefix_extractor,
+                                        Arena* arena, bool skip_filters,
+                                        TableReaderCaller caller, size_t compaction_readahead_size = 0) = 0;

   virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
       const ReadOptions& /*read_options*/) {
@@ -63,7 +63,7 @@ class TableReader {
   // E.g., the approximate offset of the last key in the table will
   // be close to the file length.
   virtual uint64_t ApproximateOffsetOf(const Slice& key,
-                                       bool for_compaction = false) = 0;
+                                       TableReaderCaller caller) = 0;

   // Set up the table for Compaction. Might change some parameters with
   // posix_fadvise
@@ -122,7 +122,7 @@
   }

   // check whether there is corruption in this db file
-  virtual Status VerifyChecksum() {
+  virtual Status VerifyChecksum(TableReaderCaller /*caller*/) {
     return Status::NotSupported("VerifyChecksum() not supported");
   }

diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index 2ec7b2d0fb5..cec62df5949 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -198,7 +198,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
     Iterator* iter = nullptr;
     InternalIterator* iiter = nullptr;
     if (!through_db) {
-      iiter = table_reader->NewIterator(read_options, nullptr);
+      iiter = table_reader->NewIterator(
+          read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized);
     } else {
       iter = db->NewIterator(read_options);
     }
diff --git a/table/table_reader_caller.h b/table/table_reader_caller.h
new file mode 100644
index 00000000000..90c64687197
--- /dev/null
+++ b/table/table_reader_caller.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace rocksdb {
+// A list of callers for a table reader. It is used to trace the caller that
+// accesses a block. This is only used for block cache tracing and analysis.
+// A user may use kUncategorized if the caller is not interesting for analysis
+// or the table reader is called in a test environment, e.g., unit test, table
+// reader benchmark, etc.
+enum TableReaderCaller : char {
+  kUserGet = 1,
+  kUserMultiGet = 2,
+  kUserIterator = 3,
+  kUserApproximateSize = 4,
+  kUserVerifyChecksum = 5,
+  kSSTDumpTool = 6,
+  kExternalSSTIngestion = 7,
+  kRepair = 8,
+  kPrefetch = 9,
+  kCompaction = 10,
+  // A compaction job may refill the block cache with blocks in the new SST
+  // files if paranoid_file_checks is true.
+  kCompactionRefill = 11,
+  // After building a table, it may load all its blocks into the block cache if
+  // paranoid_file_checks is true.
+  kFlush = 12,
+  // sst_file_reader.
+  kSSTFileReader = 13,
+  // A list of callers that are either not interesting for analysis or are
+  // calling from a test environment, e.g., unit test, benchmark, etc.
+ kUncategorized = 14, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; +} // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index 8e290368428..2e2286efae4 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -370,7 +370,9 @@ class TableConstructor: public Constructor { InternalIterator* NewIterator( const SliceTransform* prefix_extractor) const override { ReadOptions ro; - InternalIterator* iter = table_reader_->NewIterator(ro, prefix_extractor); + InternalIterator* iter = table_reader_->NewIterator( + ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kUncategorized); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -382,9 +384,11 @@ class TableConstructor: public Constructor { if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); - return table_reader_->ApproximateOffsetOf(skey); + return table_reader_->ApproximateOffsetOf( + skey, TableReaderCaller::kUncategorized); } - return table_reader_->ApproximateOffsetOf(key); + return table_reader_->ApproximateOffsetOf( + key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableCFOptions& ioptions, @@ -1538,8 +1542,9 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { auto* reader = c.GetTableReader(); ReadOptions ro; ro.total_order_seek = true; - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); ASSERT_OK(iter->status()); @@ -1597,8 +1602,9 @@ TEST_P(BlockBasedTableTest, NoopTransformSeek) { for (int i = 0; i < 2; ++i) { ReadOptions ro; ro.total_order_seek = (i == 0); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(key.Encode()); ASSERT_OK(iter->status()); @@ -1635,8 +1641,9 @@ TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { const MutableCFOptions new_moptions(options); c.Reopen(new_ioptions, new_moptions); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), new_moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup // only one kv @@ -1702,8 +1709,9 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { ASSERT_EQ(5u, props->num_data_blocks); // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr index_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr index_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // -- Find keys do not exist, but have common prefix. 
std::vector prefixes = {"001", "003", "005", "007", "009"}; @@ -1819,8 +1827,9 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { auto reader = c.GetTableReader(); ReadOptions ropt; ropt.read_tier = ReadTier::kBlockCacheTier; - std::unique_ptr iter( - reader->NewIterator(ropt, /* prefix_extractor */ nullptr)); + std::unique_ptr iter(reader->NewIterator( + ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); auto ikey = [](Slice user_key) { return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); @@ -3136,8 +3145,9 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { &kvmap); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup for (auto& kv : kvmap) { @@ -3329,8 +3339,9 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { EnvOptions(), ikc), std::move(file_reader), ss_rw.contents().size(), &table_reader); - return table_reader->NewIterator(ReadOptions(), - moptions.prefix_extractor.get()); + return table_reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); }; GetVersionAndGlobalSeqno(); @@ -3501,7 +3512,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { std::move(file_reader), ss_rw.contents().size(), &table_reader)); std::unique_ptr db_iter(table_reader->NewIterator( - ReadOptions(), moptions2.prefix_extractor.get())); + ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); int expected_key = 1; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { @@ -3795,8 +3807,9 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { auto reader = c.GetTableReader(); std::unique_ptr seek_iter; - seek_iter.reset( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + seek_iter.reset(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); for (int i = 0; i < 2; ++i) { ReadOptions ro; // for every kv, we seek using two method: Get() and Seek() @@ -3877,13 +3890,15 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { Slice upper_bound_slice(upper_bound); read_opt.iterate_upper_bound = &upper_bound_slice; std::unique_ptr iter; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); @@ -3913,8 +3928,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { Slice ub_slice1(ub1); read_opt.iterate_upper_bound = &ub_slice1; std::unique_ptr iter; - 
iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("bar"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bar", iter->key()); @@ -3924,8 +3940,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { std::string ub2 = "foo_after"; Slice ub_slice2(ub2); read_opt.iterate_upper_bound = &ub_slice2; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("foo", iter->key()); diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index a8259de71b5..732094bf29b 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -67,18 +67,36 @@ std::string block_type_to_string(TraceType type) { return "InvalidType"; } -std::string caller_to_string(BlockCacheLookupCaller caller) { +std::string caller_to_string(TableReaderCaller caller) { switch (caller) { case kUserGet: return "Get"; - case kUserMGet: + case kUserMultiGet: return "MultiGet"; case kUserIterator: return "Iterator"; + case kUserApproximateSize: + return "ApproximateSize"; + case kUserVerifyChecksum: + return "VerifyChecksum"; + case kSSTDumpTool: + return "SSTDumpTool"; + case kExternalSSTIngestion: + return "ExternalSSTIngestion"; + case kRepair: + return "Repair"; case kPrefetch: return "Prefetch"; case kCompaction: return "Compaction"; + case kCompactionRefill: + return "CompactionRefill"; + case kFlush: + return "Flush"; + case kSSTFileReader: + return "SSTFileReader"; + case kUncategorized: + return "Uncategorized"; default: break; } @@ -450,10 +468,10 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { uint64_t total_num_blocks = 0; uint64_t total_num_accesses = 0; std::map bt_num_blocks_map; - std::map caller_num_access_map; - std::map> + std::map caller_num_access_map; + std::map> caller_bt_num_access_map; - std::map> + std::map> caller_level_num_access_map; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. @@ -462,12 +480,12 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { uint64_t cf_num_blocks = 0; std::map cf_bt_blocks; uint64_t cf_num_accesses = 0; - std::map cf_caller_num_accesses_map; - std::map> + std::map cf_caller_num_accesses_map; + std::map> cf_caller_level_num_accesses_map; - std::map> + std::map> cf_caller_file_num_accesses_map; - std::map> + std::map> cf_caller_bt_num_accesses_map; total_num_files += cf_aggregates.second.fd_aggregates_map.size(); for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { @@ -492,7 +510,7 @@ void BlockCacheTraceAnalyzer::PrintStatsSummary() const { for (auto const& stats : block_access_info.second.caller_num_access_map) { // Stats per caller. - const BlockCacheLookupCaller caller = stats.first; + const TableReaderCaller caller = stats.first; const uint64_t num_accesses = stats.second; // Overall stats. 
        total_num_accesses += num_accesses;
diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h
index 0690d14d0f3..c953ecf2164 100644
--- a/tools/block_cache_trace_analyzer.h
+++ b/tools/block_cache_trace_analyzer.h
@@ -72,7 +72,7 @@ struct BlockAccessInfo {
   std::map<std::string, uint64_t> non_exist_key_num_access_map;  // for keys that do not exist in this block.
   uint64_t num_referenced_key_exist_in_block = 0;
-  std::map<BlockCacheLookupCaller, uint64_t> caller_num_access_map;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;

   void AddAccess(const BlockCacheTraceRecord& access) {
     if (first_access_time == 0) {
diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
index df99e1f616e..c361ba054ac 100644
--- a/tools/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -61,23 +61,23 @@ class BlockCacheTracerTest : public testing::Test {
     EXPECT_OK(env_->DeleteDir(test_path_));
   }

-  BlockCacheLookupCaller GetCaller(uint32_t key_id) {
+  TableReaderCaller GetCaller(uint32_t key_id) {
     uint32_t n = key_id % 5;
     switch (n) {
       case 0:
-        return BlockCacheLookupCaller::kPrefetch;
+        return TableReaderCaller::kPrefetch;
       case 1:
-        return BlockCacheLookupCaller::kCompaction;
+        return TableReaderCaller::kCompaction;
      case 2:
-        return BlockCacheLookupCaller::kUserGet;
+        return TableReaderCaller::kUserGet;
      case 3:
-        return BlockCacheLookupCaller::kUserMGet;
+        return TableReaderCaller::kUserMultiGet;
      case 4:
-        return BlockCacheLookupCaller::kUserIterator;
+        return TableReaderCaller::kUserIterator;
     }
     // This cannot happen.
     assert(false);
-    return BlockCacheLookupCaller::kUserGet;
+    return TableReaderCaller::kMaxBlockCacheLookupCaller;
   }

   void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
@@ -124,15 +124,15 @@ class BlockCacheTracerTest : public testing::Test {
       ASSERT_GT(block_access_info.first_access_time, 0);
       ASSERT_GT(block_access_info.last_access_time, 0);
       ASSERT_EQ(1, block_access_info.caller_num_access_map.size());
-      BlockCacheLookupCaller expected_caller = GetCaller(key_id);
+      TableReaderCaller expected_caller = GetCaller(key_id);
       ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) !=
                   block_access_info.caller_num_access_map.end());
       ASSERT_EQ(
           1,
           block_access_info.caller_num_access_map.find(expected_caller)->second);
-      if ((expected_caller == BlockCacheLookupCaller::kUserGet ||
-           expected_caller == BlockCacheLookupCaller::kUserMGet) &&
+      if ((expected_caller == TableReaderCaller::kUserGet ||
+           expected_caller == TableReaderCaller::kUserMultiGet) &&
          type == TraceType::kBlockTraceDataBlock) {
        ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys);
        ASSERT_EQ(1, block_access_info.key_num_access_map.size());
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index ed5600194ad..260d15f303c 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -143,7 +143,7 @@ Status SstFileDumper::NewTableReader(
 }

 Status SstFileDumper::VerifyChecksum() {
-  return table_reader_->VerifyChecksum();
+  return table_reader_->VerifyChecksum(TableReaderCaller::kSSTDumpTool);
 }

 Status SstFileDumper::DumpTable(const std::string& out_filename) {
@@ -173,7 +173,8 @@ uint64_t SstFileDumper::CalculateCompressedTableSize(
       TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
       dest_writer.get()));
   std::unique_ptr iter(table_reader_->NewIterator(
-      ReadOptions(), moptions_.prefix_extractor.get()));
+      ReadOptions(), moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
   for (iter->SeekToFirst();
iter->Valid(); iter->Next()) { if (!iter->status().ok()) { fputs(iter->status().ToString().c_str(), stderr); @@ -299,7 +300,9 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, } InternalIterator* iter = table_reader_->NewIterator( - ReadOptions(verify_checksum_, false), moptions_.prefix_extractor.get()); + ReadOptions(verify_checksum_, false), moptions_.prefix_extractor.get(), + /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kSSTDumpTool); uint64_t i = 0; if (has_from) { InternalKey ikey; diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index a0f0676eecf..4c5ad011609 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -31,11 +31,11 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) { const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = "UnknownColumnFamily"; -bool BlockCacheTraceHelper::ShouldTraceReferencedKey( - TraceType block_type, BlockCacheLookupCaller caller) { +bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, + TableReaderCaller caller) { return (block_type == TraceType::kBlockTraceDataBlock) && - (caller == BlockCacheLookupCaller::kUserGet || - caller == BlockCacheLookupCaller::kUserMGet); + (caller == TableReaderCaller::kUserGet || + caller == TableReaderCaller::kUserMultiGet); } BlockCacheTraceWriter::BlockCacheTraceWriter( @@ -182,7 +182,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { return Status::Incomplete( "Incomplete access record: Failed to read caller."); } - record->caller = static_cast<BlockCacheLookupCaller>(enc_slice[0]); + record->caller = static_cast<TableReaderCaller>(enc_slice[0]); enc_slice.remove_prefix(kCharSize); if (enc_slice.empty()) { return Status::Incomplete( diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index bf88133111e..e7f38db3c6d 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -11,21 +11,11 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/trace_reader_writer.h" +#include "table/table_reader_caller.h" #include "trace_replay/trace_replay.h" namespace rocksdb { -enum BlockCacheLookupCaller : char { - kUserGet = 1, - kUserMGet = 2, - kUserIterator = 3, - kUserApproximateSize = 4, - kPrefetch = 5, - kCompaction = 6, - // All callers should be added before kMaxBlockCacheLookupCaller. - kMaxBlockCacheLookupCaller -}; - // Lookup context for tracing block cache accesses. // We trace block accesses at five places: // 1. BlockBasedTable::GetFilter // 2. BlockBasedTable::GetUncompressionDict // 3. BlockBasedTable::MaybeReadBlockAndLoadToCache // 4. BlockBasedTable::Get // 5. BlockBasedTable::MultiGet // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or // kUserApproximateSize). struct BlockCacheLookupContext { - BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) - : caller(_caller) {} - const BlockCacheLookupCaller caller; + BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} + const TableReaderCaller caller; // These are populated when we perform lookup/insert on block cache. The block // cache tracer uses this information when logging the block access at // BlockBasedTable::GET and BlockBasedTable::MultiGet.
@@ -84,8 +73,7 @@ struct BlockCacheTraceRecord { std::string cf_name; uint32_t level = 0; uint64_t sst_fd_number = 0; - BlockCacheLookupCaller caller = - BlockCacheLookupCaller::kMaxBlockCacheLookupCaller; + TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller; Boolean is_cache_hit = Boolean::kFalse; Boolean no_insert = Boolean::kFalse; @@ -100,7 +88,7 @@ struct BlockCacheTraceRecord { BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, TraceType _block_type, uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, uint32_t _level, - uint64_t _sst_fd_number, BlockCacheLookupCaller _caller, + uint64_t _sst_fd_number, TableReaderCaller _caller, bool _is_cache_hit, bool _no_insert, std::string _referenced_key = "", uint64_t _referenced_data_size = 0, @@ -134,7 +122,7 @@ struct BlockCacheTraceHeader { class BlockCacheTraceHelper { public: static bool ShouldTraceReferencedKey(TraceType block_type, - BlockCacheLookupCaller caller); + TableReaderCaller caller); static const std::string kUnknownColumnFamilyName; }; diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 95fe16b8c8f..44cba7bfbd8 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -37,19 +37,19 @@ class BlockCacheTracerTest : public testing::Test { EXPECT_OK(env_->DeleteDir(test_path_)); } - BlockCacheLookupCaller GetCaller(uint32_t key_id) { + TableReaderCaller GetCaller(uint32_t key_id) { uint32_t n = key_id % 5; switch (n) { case 0: - return BlockCacheLookupCaller::kPrefetch; + return TableReaderCaller::kPrefetch; case 1: - return BlockCacheLookupCaller::kCompaction; + return TableReaderCaller::kCompaction; case 2: - return BlockCacheLookupCaller::kUserGet; + return TableReaderCaller::kUserGet; case 3: - return BlockCacheLookupCaller::kUserMGet; + return TableReaderCaller::kUserMultiGet; case 4: - return BlockCacheLookupCaller::kUserIterator; + return TableReaderCaller::kUserIterator; } assert(false); } @@ -121,8 +121,8 @@ class BlockCacheTracerTest : public testing::Test { ASSERT_EQ(Boolean::kFalse, record.is_cache_hit); ASSERT_EQ(Boolean::kFalse, record.no_insert); if (block_type == TraceType::kBlockTraceDataBlock && - (record.caller == BlockCacheLookupCaller::kUserGet || - record.caller == BlockCacheLookupCaller::kUserMGet)) { + (record.caller == TableReaderCaller::kUserGet || + record.caller == TableReaderCaller::kUserMultiGet)) { ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), record.referenced_key); ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block); From 1bfeffab2dbff7eaf74a61cf52d57cf1404ef159 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 20 Jun 2019 22:14:00 -0700 Subject: [PATCH 172/572] Stop printing after verification fails (#5493) Summary: Stop verification and printing once verification fails. 
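For illustration, the early-exit pattern here boils down to checking a shared failure flag before doing more verification work. Below is a minimal self-contained C++ sketch; SharedState is a hypothetical stand-in for db_stress's shared state object, and only HasVerificationFailedYet() mirrors the real accessor used in the diff that follows.

#include <atomic>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for db_stress's shared state: a flag that any
// verification thread can set once it detects a mismatch.
struct SharedState {
  std::atomic<bool> verification_failed{false};
  bool HasVerificationFailedYet() const { return verification_failed.load(); }
};

// Verify values, but stop verifying (and printing) as soon as a failure has
// been recorded, mirroring the early-exit added to the loop in the patch.
void VerifyAll(SharedState* shared, const std::vector<int>& got,
               const std::vector<int>& expected) {
  for (size_t i = 0; i < got.size() && i < expected.size(); ++i) {
    if (shared->HasVerificationFailedYet()) {
      break;  // a mismatch was already reported; avoid flooding the output
    }
    if (got[i] != expected[i]) {
      shared->verification_failed.store(true);
      std::fprintf(stderr, "Verification failed at index %zu\n", i);
    }
  }
}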
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5493 Differential Revision: D15928992 Pulled By: riversand963 fbshipit-source-id: 699feac034a217d57280aa3fb50f5aba06adf317 --- tools/db_stress.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 5fd84258b1f..6a3e8bdefb1 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -4054,6 +4054,9 @@ class AtomicFlushStressTest : public StressTest { assert(num == iters.size()); std::vector<Status> statuses(num, Status::OK()); do { + if (shared->HasVerificationFailedYet()) { + break; + } size_t valid_cnt = 0; size_t idx = 0; for (auto& iter : iters) { From 2730fe693edf306aad11a48491cfe3be4c178a47 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Fri, 21 Jun 2019 10:12:29 -0700 Subject: [PATCH 173/572] Fix ingested file and directory not being synced (#5435) Summary: It is not safe to assume the application has synced the SST file before ingesting it into the DB. Also, the directory the ingested file is placed in needs to be fsynced, otherwise the file can be lost. For the integrity of RocksDB we need to sync the ingested file and directory before applying the change to the manifest. Also, syncing after writing the global sequence number when write_global_seqno=true was removed in https://github.com/facebook/rocksdb/issues/4172. Adding it back. Fixes https://github.com/facebook/rocksdb/issues/5287. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5435 Test Plan: Test ingesting a file with the ldb command and observe fsync/fdatasync in strace output. Tried both move_files=true and move_files=false. https://gist.github.com/yiwu-arbug/650a4023f57979056d83485fa863bef9 More test suggestions are welcome. Differential Revision: D15941675 Pulled By: riversand963 fbshipit-source-id: 389533f3923065a96df2cdde23ff4724a1810d78 --- HISTORY.md | 1 + db/db_impl/db_impl.cc | 12 +---- db/db_impl/db_impl.h | 56 ++++++++++++--------- db/db_impl/db_impl_open.cc | 6 +-- db/external_sst_file_basic_test.cc | 56 +++++++++++++++++++++ db/external_sst_file_ingestion_job.cc | 67 +++++++++++++++++++++++-- db/external_sst_file_ingestion_job.h | 15 +++++- test_util/fault_injection_test_env.cc | 72 ++++++++++++++++++++++++++- test_util/fault_injection_test_env.h | 29 +++++++++++ 9 files changed, 270 insertions(+), 44 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 825c1def47c..975ece580d4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -29,6 +29,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Fix ingested file and directory not being fsynced. * Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST.
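To make the durability requirement above concrete, here is a minimal POSIX-style sketch of the pattern this patch implements through RocksDB's Env/Directory abstractions: sync the file's contents first, then fsync the parent directory so the new directory entry itself survives a crash. This is illustrative only; PersistNewFile is not a RocksDB API.

#include <fcntl.h>
#include <unistd.h>

// Durably persist a freshly linked/copied file: fsync the file, then fsync
// its parent directory so the directory entry is also on stable storage.
// Returns 0 on success, -1 on failure.
int PersistNewFile(const char* file_path, const char* dir_path) {
  int fd = open(file_path, O_RDONLY);
  if (fd < 0) return -1;
  int rc = fsync(fd);
  close(fd);
  if (rc != 0) return -1;
  int dfd = open(dir_path, O_RDONLY | O_DIRECTORY);
  if (dfd < 0) return -1;
  rc = fsync(dfd);
  close(dfd);
  return rc;
}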
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index f3fc96d8d1f..e2de696ef57 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -861,16 +861,6 @@ Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { return ret_dir; } -Directory* DBImpl::Directories::GetDataDir(size_t path_id) const { - assert(path_id < data_dirs_.size()); - Directory* ret_dir = data_dirs_[path_id].get(); - if (ret_dir == nullptr) { - // Should use db_dir_ - return db_dir_.get(); - } - return ret_dir; -} - Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map<std::string, std::string>& options_map) { @@ -3644,7 +3634,7 @@ Status DBImpl::IngestExternalFiles( auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd(); ingestion_jobs.emplace_back(env_, versions_.get(), cfd, immutable_db_options_, env_options_, - &snapshots_, arg.options); + &snapshots_, arg.options, &directories_); } std::vector<std::pair<bool, Status>> exec_results; for (size_t i = 0; i != num_cfs; ++i) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index e6d5a56e244..b5437c49543 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -77,6 +77,38 @@ struct JobContext; struct ExternalSstFileInfo; struct MemTableInfo; +// Class to maintain directories for all database paths other than main one. +class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector<std::string>& data_paths); + + Directory* GetDataDir(size_t path_id) const { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); + } + return ret_dir; + } + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr<Directory> db_dir_; + std::vector<std::unique_ptr<Directory>> data_dirs_; + std::unique_ptr<Directory> wal_dir_; +}; + // While DB is the public interface of RocksDB, and DBImpl is the actual // class implementing it. It's the entrance of the core RocksDB engine. // All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a @@ -1047,30 +1079,6 @@ class DBImpl : public DB { } }; - // Class to maintain directories for all database paths other than main one.
- class Directories { - public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector<std::string>& data_paths); - - Directory* GetDataDir(size_t path_id) const; - - Directory* GetWalDir() { - if (wal_dir_) { - return wal_dir_.get(); - } - return db_dir_.get(); - } - - Directory* GetDbDir() { return db_dir_.get(); } - - private: - std::unique_ptr<Directory> db_dir_; - std::vector<std::unique_ptr<Directory>> data_dirs_; - std::unique_ptr<Directory> wal_dir_; - }; - struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} void AddSize(uint64_t new_size) { size += new_size; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index eec7cf16aa7..13d6959d474 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -265,9 +265,9 @@ Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, return env->NewDirectory(dirname, directory); } -Status DBImpl::Directories::SetDirectories( - Env* env, const std::string& dbname, const std::string& wal_dir, - const std::vector<std::string>& data_paths) { +Status Directories::SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector<std::string>& data_paths) { Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); if (!s.ok()) { return s; diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 91a422bed9e..ff7da502afb 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,6 +9,7 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" +#include "test_util/fault_injection_test_env.h" #include "test_util/testutil.h" namespace rocksdb { @@ -20,6 +21,7 @@ class ExternalSSTFileBasicTest public: ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { sst_files_dir_ = dbname_ + "/sst_files/"; + fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); DestroyAndRecreateExternalSSTFilesDir(); } @@ -140,6 +142,7 @@ class ExternalSSTFileBasicTest protected: std::string sst_files_dir_; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_; }; TEST_F(ExternalSSTFileBasicTest, Basic) { @@ -689,6 +692,59 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(ExternalSSTFileBasicTest, SyncFailure) { + Options options; + options.create_if_missing = true; + options.env = fault_injection_test_env_.get(); + + std::vector<std::pair<std::string, std::string>> test_cases = { + {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile", + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"}, + {"ExternalSstFileIngestionJob::BeforeSyncDir", + "ExternalSstFileIngestionJob::AfterSyncDir"}, + {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno", + "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}}; + + for (size_t i = 0; i < test_cases.size(); i++) { + SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(false); + }); + SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(true); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + if (i == 2) { + ASSERT_OK(Put("foo", "v1")); + } + + Options sst_file_writer_options; + std::unique_ptr<SstFileWriter> sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst";
ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + if (i == 0) { + ingest_opt.move_files = true; + } + const Snapshot* snapshot = db_->GetSnapshot(); + if (i == 2) { + ingest_opt.write_global_seqno = true; + } + ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); + } +} + TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { int kNumLevels = 7; Options options = CurrentOptions(); diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 7e9657cc901..44b50168566 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -7,11 +7,13 @@ #include "db/external_sst_file_ingestion_job.h" -#include <cinttypes> #include <algorithm> +#include <cinttypes> #include <string> +#include <unordered_set> #include <vector> +#include "db/db_impl/db_impl.h" #include "db/version_edit.h" #include "file/file_util.h" #include "table/merging_iterator.h" @@ -86,6 +88,7 @@ Status ExternalSstFileIngestionJob::Prepare( } // Copy/Move external files into DB + std::unordered_set<size_t> ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); f.copy_file = false; @@ -95,8 +98,26 @@ f.fd.GetPathId()); if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported() && - ingestion_options_.failed_move_fall_back_to_copy) { + if (status.ok()) { + // It is unsafe to assume the application has synced the file and its + // directory before ingesting it. For the integrity of RocksDB we need + // to sync the file. + std::unique_ptr<WritableFile> file_to_sync; + status = env_->ReopenWritableFile(path_inside_db, &file_to_sync, + env_options_); + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } + } + } else if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; } @@ -107,6 +128,7 @@ Status ExternalSstFileIngestionJob::Prepare( if (f.copy_file) { TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", nullptr); + // CopyFile also syncs the new file. status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); } @@ -115,8 +137,25 @@ break; } f.internal_file_path = path_inside_db; + ingestion_path_ids.insert(f.fd.GetPathId()); + } + + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); + if (status.ok()) { + for (auto path_id : ingestion_path_ids) { + status = directories_->GetDataDir(path_id)->Fsync(); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync directory %" ROCKSDB_PRIszt + " while ingest file: %s", + path_id, status.ToString().c_str()); + break; + } + } } + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + // TODO: The following is duplicated with Cleanup().
if (!status.ok()) { // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { @@ -559,6 +598,18 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( std::string seqno_val; PutFixed64(&seqno_val, seqno); status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (status.ok()) { + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); + status = SyncIngestedFile(rwfile.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s after writing global " + "sequence number: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + } + } if (!status.ok()) { return status; } @@ -599,6 +650,16 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( return true; } +template <typename TWritableFile> +Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { + assert(file != nullptr); + if (db_options_.use_fsync) { + return file->Fsync(); + } else { + return file->Sync(); + } +} + } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index baa8e9f0f64..50f3944054f 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -20,6 +20,8 @@ namespace rocksdb { +class Directories; + struct IngestedFileInfo { // External file path std::string external_file_path; @@ -77,7 +79,8 @@ class ExternalSstFileIngestionJob { Env* env, VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, - const IngestExternalFileOptions& ingestion_options) + const IngestExternalFileOptions& ingestion_options, + Directories* directories) : env_(env), versions_(versions), cfd_(cfd), db_options_(db_options), env_options_(env_options), db_snapshots_(db_snapshots), ingestion_options_(ingestion_options), + directories_(directories), job_start_time_(env_->NowMicros()), - consumed_seqno_(false) {} + consumed_seqno_(false) { + assert(directories != nullptr); + } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector<std::string>& external_files_paths, @@ -153,6 +159,10 @@ bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, int level); + // Helper method to sync given file.
+ template <typename TWritableFile> + Status SyncIngestedFile(TWritableFile* file); + Env* env_; VersionSet* versions_; ColumnFamilyData* cfd_; @@ -161,6 +171,7 @@ SnapshotList* db_snapshots_; autovector<IngestedFileInfo> files_to_ingest_; const IngestExternalFileOptions& ingestion_options_; + Directories* directories_; VersionEdit edit_; uint64_t job_start_time_; bool consumed_seqno_; diff --git a/test_util/fault_injection_test_env.cc b/test_util/fault_injection_test_env.cc index a591ff4b57b..5c47b7ea455 100644 --- a/test_util/fault_injection_test_env.cc +++ b/test_util/fault_injection_test_env.cc @@ -98,6 +98,9 @@ Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { } Status TestDirectory::Fsync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } env_->SyncDir(dirname_); return dir_->Fsync(); } @@ -158,6 +161,53 @@ Status TestWritableFile::Sync() { return Status::OK(); } +TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, + std::unique_ptr<RandomRWFile>&& f, + FaultInjectionTestEnv* env) + : target_(std::move(f)), file_opened_(true), env_(env) { + assert(target_ != nullptr); +} + +TestRandomRWFile::~TestRandomRWFile() { + if (file_opened_) { + Close(); + } +} + +Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Write(offset, data); +} + +Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomRWFile::Close() { + file_opened_ = false; + return target_->Close(); +} + +Status TestRandomRWFile::Flush() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Flush(); +} + +Status TestRandomRWFile::Sync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Sync(); +} + Status FaultInjectionTestEnv::NewDirectory(const std::string& name, std::unique_ptr<Directory>* result) { std::unique_ptr<Directory> r; @@ -220,6 +270,27 @@ Status FaultInjectionTestEnv::ReopenWritableFile( return s; } +Status FaultInjectionTestEnv::NewRandomRWFile( + const std::string& fname, std::unique_ptr<RandomRWFile>* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->NewRandomRWFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state.
+ UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + Status FaultInjectionTestEnv::NewRandomAccessFile( const std::string& fname, std::unique_ptr<RandomAccessFile>* result, const EnvOptions& soptions) { @@ -238,7 +309,6 @@ Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), s.ToString().c_str()); } - assert(s.ok()); if (s.ok()) { UntrackFile(f); } diff --git a/test_util/fault_injection_test_env.h b/test_util/fault_injection_test_env.h index d962acfd585..b68b3faedce 100644 --- a/test_util/fault_injection_test_env.h +++ b/test_util/fault_injection_test_env.h @@ -82,6 +82,31 @@ class TestWritableFile : public WritableFile { FaultInjectionTestEnv* env_; }; +// A wrapper around RandomRWFile that fails reads, writes, flushes and syncs +// once the fault-injection filesystem has been deactivated. +class TestRandomRWFile : public RandomRWFile { + public: + explicit TestRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>&& f, + FaultInjectionTestEnv* env); + virtual ~TestRandomRWFile(); + Status Write(uint64_t offset, const Slice& data) override; + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + Status Close() override; + Status Flush() override; + Status Sync() override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr<RandomRWFile> target_; + bool file_opened_; + FaultInjectionTestEnv* env_; +}; + class TestDirectory : public Directory { public: explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, @@ -114,6 +139,10 @@ class FaultInjectionTestEnv : public EnvWrapper { std::unique_ptr<WritableFile>* result, const EnvOptions& soptions) override; + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr<RandomRWFile>* result, + const EnvOptions& soptions) override; + Status NewRandomAccessFile(const std::string& fname, std::unique_ptr<RandomAccessFile>* result, const EnvOptions& soptions) override; From 22028aa9ab27cf860b74d12e006f82ff551caee0 Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Fri, 21 Jun 2019 21:07:09 -0700 Subject: [PATCH 174/572] Compaction Reads should read no more than compaction_readahead_size bytes, when set! (#5498) Summary: As a result of https://github.com/facebook/rocksdb/issues/5431, the compaction_readahead_size given by a user was not used exactly. The reason is that the code behind readahead for user reads and compaction reads was unified in the above PR, and the behavior for user reads is to read readahead_size + n bytes (see the FilePrefetchBuffer::TryReadFromCache method). Before the unification, the ReadaheadRandomAccessFileReader used compaction_readahead_size as-is. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5498 Test Plan: Ran strace command: strace -e pread64 -f -T -t ./db_compaction_test --gtest_filter=DBCompactionTest.PartialManualCompaction In the test the compaction_readahead_size was configured to 2MB and verified the pread syscall did indeed request 2MB. Before the change it was requesting more than 2MB. Strace Output: strace: Process 3798982 attached Note: Google Test filter = DBCompactionTest.PartialManualCompaction [==========] Running 1 test from 1 test case. [----------] Global test environment set-up.
[----------] 1 test from DBCompactionTest [ RUN ] DBCompactionTest.PartialManualCompaction strace: Process 3798983 attached strace: Process 3798984 attached strace: Process 3798985 attached strace: Process 3798986 attached strace: Process 3798987 attached strace: Process 3798992 attached [pid 3798987] 12:07:05 +++ exited with 0 +++ strace: Process 3798993 attached [pid 3798993] 12:07:05 +++ exited with 0 +++ strace: Process 3798994 attached strace: Process 3799008 attached strace: Process 3799009 attached [pid 3799008] 12:07:05 +++ exited with 0 +++ strace: Process 3799010 attached [pid 3799009] 12:07:05 +++ exited with 0 +++ strace: Process 3799011 attached [pid 3799010] 12:07:05 +++ exited with 0 +++ [pid 3799011] 12:07:05 +++ exited with 0 +++ strace: Process 3799012 attached [pid 3799012] 12:07:05 +++ exited with 0 +++ strace: Process 3799013 attached strace: Process 3799014 attached [pid 3799013] 12:07:05 +++ exited with 0 +++ strace: Process 3799015 attached [pid 3799014] 12:07:05 +++ exited with 0 +++ [pid 3799015] 12:07:05 +++ exited with 0 +++ strace: Process 3799016 attached [pid 3799016] 12:07:05 +++ exited with 0 +++ strace: Process 3799017 attached [pid 3799017] 12:07:05 +++ exited with 0 +++ strace: Process 3799019 attached [pid 3799019] 12:07:05 +++ exited with 0 +++ strace: Process 3799020 attached strace: Process 3799021 attached [pid 3799020] 12:07:05 +++ exited with 0 +++ [pid 3799021] 12:07:05 +++ exited with 0 +++ strace: Process 3799022 attached [pid 3799022] 12:07:05 +++ exited with 0 +++ strace: Process 3799023 attached [pid 3799023] 12:07:05 +++ exited with 0 +++ strace: Process 3799047 attached strace: Process 3799048 attached [pid 3799047] 12:07:06 +++ exited with 0 +++ [pid 3799048] 12:07:06 +++ exited with 0 +++ [pid 3798994] 12:07:06 +++ exited with 0 +++ strace: Process 3799052 attached [pid 3799052] 12:07:06 +++ exited with 0 +++ strace: Process 3799054 attached strace: Process 3799069 attached strace: Process 3799070 attached [pid 3799069] 12:07:06 +++ exited with 0 +++ strace: Process 3799071 attached [pid 3799070] 12:07:06 +++ exited with 0 +++ [pid 3799071] 12:07:06 +++ exited with 0 +++ strace: Process 3799072 attached strace: Process 3799073 attached [pid 3799072] 12:07:06 +++ exited with 0 +++ [pid 3799073] 12:07:06 +++ exited with 0 +++ strace: Process 3799074 attached [pid 3799074] 12:07:06 +++ exited with 0 +++ strace: Process 3799075 attached [pid 3799075] 12:07:06 +++ exited with 0 +++ strace: Process 3799076 attached [pid 3799076] 12:07:06 +++ exited with 0 +++ strace: Process 3799077 attached [pid 3799077] 12:07:06 +++ exited with 0 +++ strace: Process 3799078 attached [pid 3799078] 12:07:06 +++ exited with 0 +++ strace: Process 3799079 attached [pid 3799079] 12:07:06 +++ exited with 0 +++ strace: Process 3799080 attached [pid 3799080] 12:07:06 +++ exited with 0 +++ strace: Process 3799081 attached [pid 3799081] 12:07:06 +++ exited with 0 +++ strace: Process 3799082 attached [pid 3799082] 12:07:06 +++ exited with 0 +++ strace: Process 3799083 attached [pid 3799083] 12:07:06 +++ exited with 0 +++ strace: Process 3799086 attached strace: Process 3799087 attached [pid 3798984] 12:07:06 pread64(9, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000121> [pid 3798984] 12:07:06 pread64(9, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000106> [pid 3798984] 12:07:06 pread64(9, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000081> [pid 3798984] 12:07:06 pread64(9, 
"\0\v\3foo\2\7\0\0\0\0\0\0\0\270 \0\v\4foo\2\3\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000138> [pid 3798984] 12:07:06 pread64(11, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000097> [pid 3798984] 12:07:06 pread64(11, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086> [pid 3798984] 12:07:06 pread64(11, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000064> [pid 3798984] 12:07:06 pread64(11, "\0\v\3foo\2\21\0\0\0\0\0\0\0\270 \0\v\4foo\2\r\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000064> [pid 3798984] 12:07:06 pread64(12, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080> [pid 3798984] 12:07:06 pread64(12, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000090> [pid 3798984] 12:07:06 pread64(12, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000059> [pid 3798984] 12:07:06 pread64(12, "\0\v\3foo\2\33\0\0\0\0\0\0\0\270 \0\v\4foo\2\27\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065> [pid 3798984] 12:07:06 pread64(13, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000070> [pid 3798984] 12:07:06 pread64(13, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000059> [pid 3798984] 12:07:06 pread64(13, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000061> [pid 3798984] 12:07:06 pread64(13, "\0\v\3foo\2%\0\0\0\0\0\0\0\270 \0\v\4foo\2!\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065> [pid 3798984] 12:07:06 pread64(14, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000118> [pid 3798984] 12:07:06 pread64(14, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093> [pid 3798984] 12:07:06 pread64(14, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000050> [pid 3798984] 12:07:06 pread64(14, "\0\v\3foo\2/\0\0\0\0\0\0\0\270 \0\v\4foo\2+\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000082> [pid 3798984] 12:07:06 pread64(15, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080> [pid 3798984] 12:07:06 pread64(15, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086> [pid 3798984] 12:07:06 pread64(15, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000091> [pid 3798984] 12:07:06 pread64(15, "\0\v\3foo\0029\0\0\0\0\0\0\0\270 \0\v\4foo\0025\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000174> [pid 3798984] 12:07:06 pread64(16, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080> [pid 3798984] 12:07:06 pread64(16, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093> [pid 3798984] 12:07:06 pread64(16, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000194> [pid 3798984] 12:07:06 pread64(16, "\0\v\3foo\2C\0\0\0\0\0\0\0\270 \0\v\4foo\2?\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000086> [pid 3798984] 12:07:06 pread64(17, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000079> [pid 3798984] 12:07:06 pread64(17, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000047> [pid 3798984] 12:07:06 pread64(17, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000045> [pid 3798984] 12:07:06 pread64(17, "\0\v\3foo\2M\0\0\0\0\0\0\0\270 \0\v\4foo\2I\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000107> [pid 3798983] 12:07:06 pread64(17, 
"\0\v\200\10foo\2P\0\0\0\0\0\0)U?MSg_)j(roFn($e"..., 2097152, 0) = 11230 <0.000091> [pid 3798983] 12:07:06 pread64(17, "", 2085922, 11230) = 0 <0.000073> [pid 3798983] 12:07:06 pread64(16, "\0\v\200\10foo\2F\0\0\0\0\0\0k[h3%.OPH_^:\\S7T&"..., 2097152, 0) = 11230 <0.000083> [pid 3798983] 12:07:06 pread64(16, "", 2085922, 11230) = 0 <0.000078> [pid 3798983] 12:07:06 pread64(15, "\0\v\200\10foo\2<\0\0\0\0\0\0+qToi_c{*S+4:N(:"..., 2097152, 0) = 11230 <0.000095> [pid 3798983] 12:07:06 pread64(15, "", 2085922, 11230) = 0 <0.000067> [pid 3798983] 12:07:06 pread64(14, "\0\v\200\10foo\0022\0\0\0\0\0\0%hw%OMa\"}9I609Q!B"..., 2097152, 0) = 11230 <0.000111> [pid 3798983] 12:07:06 pread64(14, "", 2085922, 11230) = 0 <0.000093> [pid 3798983] 12:07:06 pread64(13, "\0\v\200\10foo\2(\0\0\0\0\0\0p}Y&mu^DcaSGb2&nP"..., 2097152, 0) = 11230 <0.000128> [pid 3798983] 12:07:06 pread64(13, "", 2085922, 11230) = 0 <0.000076> [pid 3798983] 12:07:06 pread64(12, "\0\v\200\10foo\2\36\0\0\0\0\0\0YIyW#]oSs^6VHfB<`"..., 2097152, 0) = 11230 <0.000092> [pid 3798983] 12:07:06 pread64(12, "", 2085922, 11230) = 0 <0.000073> [pid 3798983] 12:07:06 pread64(11, "\0\v\200\10foo\2\24\0\0\0\0\0\0mfF8Jel/*Zf :-#s("..., 2097152, 0) = 11230 <0.000088> [pid 3798983] 12:07:06 pread64(11, "", 2085922, 11230) = 0 <0.000067> [pid 3798983] 12:07:06 pread64(9, "\0\v\200\10foo\2\n\0\0\0\0\0\0\\X'cjiHX)D,RSj1X!"..., 2097152, 0) = 11230 <0.000115> [pid 3798983] 12:07:06 pread64(9, "", 2085922, 11230) = 0 <0.000073> [pid 3798983] 12:07:06 pread64(8, "\1\315\5 \36\30\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 754) = 53 <0.000098> [pid 3798983] 12:07:06 pread64(8, "\0\22\3rocksdb.properties;\215\5\0\0\0\0\1\0\0\0"..., 37, 717) = 37 <0.000064> [pid 3798983] 12:07:06 pread64(8, "\0$\4rocksdb.block.based.table.ind"..., 658, 59) = 658 <0.000074> [pid 3798983] 12:07:06 pread64(8, "\0\v\2foo\1\0\0\0\0\0\0\0\0\31\0\0\0\0\1\0\0\0\0\212\216\222P", 29, 30) = 29 <0.000064> [pid 3799086] 12:07:06 +++ exited with 0 +++ [pid 3799087] 12:07:06 +++ exited with 0 +++ [pid 3799054] 12:07:06 +++ exited with 0 +++ strace: Process 3799104 attached [pid 3799104] 12:07:06 +++ exited with 0 +++ [ OK ] DBCompactionTest.PartialManualCompaction (757 ms) [----------] 1 test from DBCompactionTest (758 ms total) [----------] Global test environment tear-down [==========] 1 test from 1 test case ran. (759 ms total) [ PASSED ] 1 test. 
[pid 3798983] 12:07:06 +++ exited with 0 +++ [pid 3798984] 12:07:06 +++ exited with 0 +++ [pid 3798992] 12:07:06 +++ exited with 0 +++ [pid 3798986] 12:07:06 +++ exited with 0 +++ [pid 3798982] 12:07:06 +++ exited with 0 +++ [pid 3798985] 12:07:06 +++ exited with 0 +++ 12:07:06 +++ exited with 0 +++ Differential Revision: D15948422 Pulled By: vjnadimpalli fbshipit-source-id: 9b189d1e8675d290c7784e4b33e5d3b5761d2ac8 --- util/file_reader_writer.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 0af4c2098f1..bf88503339a 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -797,9 +797,12 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, if (readahead_size_ > 0) { assert(file_reader_ != nullptr); assert(max_readahead_size_ >= readahead_size_); - - Status s = - Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + Status s; + if (for_compaction) { + s = Prefetch(file_reader_, offset, readahead_size_, for_compaction); + } else { + s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + } if (!s.ok()) { return false; } From 68980df89cc67a553b589c0e9000cef9b60bd344 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 24 Jun 2019 10:38:02 -0700 Subject: [PATCH 175/572] Also build compression libraries on AppVeyor CI (#5226) Summary: This adds some compression dependencies to AppVeyor CI (those whose builds can be easily scripted on Windows, i.e. Snappy, LZ4, and ZStd). Let's see if the CI passes ;-) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5226 Differential Revision: D15967223 fbshipit-source-id: 0914c613ac358cbb248df75cdee8099e836828dc --- appveyor.yml | 62 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 9dae40af8f7..6bdb164e84e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,15 +1,67 @@ version: 1.0.{build} + image: Visual Studio 2017 + +environment: + JAVA_HOME: C:\Program Files\Java\jdk1.8.0 + THIRDPARTY_HOME: $(APPVEYOR_BUILD_FOLDER)\thirdparty + SNAPPY_HOME: $(THIRDPARTY_HOME)\snappy-1.1.7 + SNAPPY_INCLUDE: $(SNAPPY_HOME);$(SNAPPY_HOME)\build + SNAPPY_LIB_DEBUG: $(SNAPPY_HOME)\build\Debug\snappy.lib + SNAPPY_LIB_RELEASE: $(SNAPPY_HOME)\build\Release\snappy.lib + LZ4_HOME: $(THIRDPARTY_HOME)\lz4-1.8.3 + LZ4_INCLUDE: $(LZ4_HOME)\lib + LZ4_LIB_DEBUG: $(LZ4_HOME)\visual\VS2010\bin\x64_Debug\liblz4_static.lib + LZ4_LIB_RELEASE: $(LZ4_HOME)\visual\VS2010\bin\x64_Release\liblz4_static.lib + ZSTD_HOME: $(THIRDPARTY_HOME)\zstd-1.4.0 + ZSTD_INCLUDE: $(ZSTD_HOME)\lib;$(ZSTD_HOME)\lib\dictBuilder + ZSTD_LIB_DEBUG: $(ZSTD_HOME)\build\VS2010\bin\x64_Debug\libzstd_static.lib + ZSTD_LIB_RELEASE: $(ZSTD_HOME)\build\VS2010\bin\x64_Release\libzstd_static.lib + +install: + - md %THIRDPARTY_HOME% + - echo "Building Snappy dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o snappy-1.1.7.zip https://github.com/google/snappy/archive/1.1.7.zip + - unzip snappy-1.1.7.zip + - cd snappy-1.1.7 + - mkdir build + - cd build + - cmake -DCMAKE_GENERATOR_PLATFORM=x64 .. + - msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building LZ4 dependency..." 
+ - cd %THIRDPARTY_HOME% + - curl -fsSL -o lz4-1.8.3.zip https://github.com/lz4/lz4/archive/v1.8.3.zip + - unzip lz4-1.8.3.zip + - cd lz4-1.8.3\visual\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD lz4.sln /upgrade + - msbuild lz4.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild lz4.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building ZStd dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o zstd-1.4.0.zip https://github.com/facebook/zstd/archive/v1.4.0.zip + - unzip zstd-1.4.0.zip + - cd zstd-1.4.0\build\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD zstd.sln /upgrade + - msbuild zstd.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild zstd.sln /p:Configuration=Release /p:Platform=x64 + before_build: -- md %APPVEYOR_BUILD_FOLDER%\build -- cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DWITH_XPRESS=1 -DPORTABLE=1 -DJNI=1 .. -- cd .. + - md %APPVEYOR_BUILD_FOLDER%\build + - cd %APPVEYOR_BUILD_FOLDER%\build + - cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 .. + - cd .. build: project: build\rocksdb.sln parallel: true verbosity: normal + test: + test_script: -- ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + +on_failure: + - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip From c92c58f84dff863ea0e41db2c31de3ae9d75a539 Mon Sep 17 00:00:00 2001 From: Jermy Li Date: Mon, 24 Jun 2019 11:32:45 -0700 Subject: [PATCH 176/572] JNI: Do not create 8M block cache for negative blockCacheSize values (#5465) Summary: As [BlockBasedTableConfig setBlockCacheSize()](https://github.com/facebook/rocksdb/blob/1966a7c055f6e182d627275051f5c09441aa922d/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java#L728) says, "If cacheSize is non-positive, then cache will not be used." But when we configure a negative number or 0, there is an unexpected result: the block cache becomes 8M. - Allow 0 as a valid size. When the block cache size is 0, an 8MB block cache is created, as that is the default C++ API behavior. Also updated the comment. - Set no_block_cache to true if a negative value is passed as the block cache size, so that no block cache will be created.
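In C++ terms, the intended mapping from the Java-side value is roughly the following sketch. ConfigureBlockCache is a hypothetical helper written for illustration; NewLRUCache, block_cache, and no_block_cache are the real RocksDB options touched by the JNI change shown below.

#include <rocksdb/cache.h>
#include <rocksdb/table.h>

// Sketch: translate the Java blockCacheSize into BlockBasedTableOptions.
void ConfigureBlockCache(rocksdb::BlockBasedTableOptions* options,
                         long long jblock_cache_size) {
  if (jblock_cache_size >= 0) {
    // Zero or positive: create an LRU block cache; per the summary above,
    // a size of 0 ends up with the default 8MB cache behavior.
    options->block_cache =
        rocksdb::NewLRUCache(static_cast<size_t>(jblock_cache_size));
  } else {
    // Negative: opt out of the block cache entirely.
    options->no_block_cache = true;
    options->block_cache = nullptr;
  }
}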
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5465 Differential Revision: D15968788 Pulled By: sagar0 fbshipit-source-id: ee02d6e95841c9e2c316a64bfdf192d46ff5638a --- java/rocksjni/table.cc | 5 ++++- java/src/main/java/org/rocksdb/BlockBasedTableConfig.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 1ccc550ab62..a4504d917ab 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -85,7 +85,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( std::shared_ptr<rocksdb::Cache> *pCache = reinterpret_cast<std::shared_ptr<rocksdb::Cache> *>(jblock_cache_handle); options.block_cache = *pCache; - } else if (jblock_cache_size > 0) { + } else if (jblock_cache_size >= 0) { if (jblock_cache_num_shard_bits > 0) { options.block_cache = rocksdb::NewLRUCache( static_cast<size_t>(jblock_cache_size), @@ -94,6 +94,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = rocksdb::NewLRUCache( static_cast<size_t>(jblock_cache_size)); } + } else { + options.no_block_cache = true; + options.block_cache = nullptr; + } } if (jpersistent_cache_handle > 0) { diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 4c88a0224c6..bf5c0c1a921 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -725,7 +725,7 @@ public long blockCacheSize() { /** * Set the size of the cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. + * If cacheSize is negative, then cache will not be used. * DEFAULT: 8M * * @param blockCacheSize block cache size in bytes From e731f4402258554812c46334dc0d9483e6cc769b Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Mon, 24 Jun 2019 16:08:17 -0700 Subject: [PATCH 177/572] C file should not include <cstddef>, it is a C++ header. (#5499) Summary: Include <stddef.h> instead. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5499 Differential Revision: D15966937 Pulled By: miasantreble fbshipit-source-id: 2156c4329b91d26d447de94f1231264d52786350 --- util/crc32c_ppc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/crc32c_ppc.c b/util/crc32c_ppc.c index ce0b9f27ce6..888a4943eaa 100644 --- a/util/crc32c_ppc.c +++ b/util/crc32c_ppc.c @@ -6,7 +6,7 @@ // (found in the LICENSE.Apache file in the root directory). #define CRC_TABLE -#include <cstddef> +#include <stddef.h> #include <stdint.h> #include <strings.h> #include "util/crc32c_ppc_constants.h" From acb80534cac798d250ee85812f0e45112f2e4b66 Mon Sep 17 00:00:00 2001 From: Huisheng Liu Date: Mon, 24 Jun 2019 17:36:26 -0700 Subject: [PATCH 178/572] Fix build jemalloc api (#5470) Summary: There is a compile error on Windows with MSVC in malloc_stats.cc where malloc_stats_print is referenced. The compiler only knows je_malloc_stats_print from jemalloc.h. Adding JEMALLOC_NO_RENAME replaces malloc_stats_print with je_malloc_stats_print.
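As a rough illustration of the mechanism described above (paraphrased from the summary; the exact macro machinery lives in jemalloc's own headers, so treat the mapping below as an assumption, not jemalloc's literal code):

// With -DJEMALLOC_NO_RENAME in effect, jemalloc's headers keep the
// unprefixed public names usable by mapping them onto the exported
// je_-prefixed symbols, conceptually:
//
//   #define malloc_stats_print je_malloc_stats_print
//
// so the existing call site in malloc_stats.cc compiles and links on
// Windows without any source change.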
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5470 Differential Revision: D15978720 fbshipit-source-id: c05757a2e89e2e015a661d9626c352e4f32f97e4 --- thirdparty.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty.inc b/thirdparty.inc index ed9d4c0f8db..25ecdab88c2 100644 --- a/thirdparty.inc +++ b/thirdparty.inc @@ -241,7 +241,7 @@ endif() if (WITH_JEMALLOC) message(STATUS "JEMALLOC library is enabled") - set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= ") + set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= -DJEMALLOC_NO_RENAME") if(DEFINED ENV{JEMALLOC_INCLUDE}) set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE}) From 554a6456aad5b46149e05eab41779778c51607f4 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 24 Jun 2019 20:38:20 -0700 Subject: [PATCH 179/572] Block cache trace analysis: Write time series graphs in csv files (#5490) Summary: This PR adds a feature to the block cache trace analysis tool to write statistics into csv files. 1. The analysis tool supports grouping the number of accesses per second by various labels, e.g., block, column family, block type, or a combination of them. 2. It also computes reuse distance and reuse interval. Reuse distance: The cumulated size of unique blocks read between two consecutive accesses on the same block. Reuse interval: The time between two consecutive accesses on the same block. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5490 Differential Revision: D15901322 Pulled By: HaoyuHuang fbshipit-source-id: b5454fea408a32757a80be63de6fe1c8149ca70e --- tools/block_cache_trace_analyzer.cc | 484 ++++++++++++++++++++++- tools/block_cache_trace_analyzer.h | 58 ++- tools/block_cache_trace_analyzer_test.cc | 115 +++++- 3 files changed, 628 insertions(+), 29 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 732094bf29b..78753a21622 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "monitoring/histogram.h" #include "util/gflags_compat.h" @@ -42,12 +41,70 @@ DEFINE_bool(print_data_block_access_count_stats, false, DEFINE_int32(cache_sim_warmup_seconds, 0, "The number of seconds to warmup simulated caches. The hit/miss " "counters are reset after the warmup completes."); -DEFINE_string(output_miss_ratio_curve_path, "", - "The output file to save the computed miss ratios. File format: " - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"); +DEFINE_string( + block_cache_analysis_result_dir, "", + "The directory that saves block cache analysis results. It contains 1) an " + "mrc file that saves the computed miss ratios for simulated caches. Its " + "format is " + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses. 2) Several " + "\"label_access_timeline\" files that contain number of accesses per " + "second grouped by the label. File format: " + "time,label_1_access_per_second,label_2_access_per_second,...,label_N_" + "access_per_second where N is the number of unique labels found in the " + "trace. 3) Several \"label_reuse_distance\" and \"label_reuse_interval\" " + "csv files that contain the reuse distance/interval grouped by label. File " + "format: bucket,label_1,label_2,...,label_N. The first N buckets are " + "absolute values.
The second N buckets are percentage values."); +DEFINE_string( + timeline_labels, "", + "Group the number of accesses per block per second using these labels. " + "Possible labels are a combination of the following: cf (column family), " + "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" " + "means the number of accesses per second is grouped by unique pairs of " + "\"cf_bt\". A label \"all\" contains the aggregated number of accesses per " + "second across all possible labels."); +DEFINE_string(reuse_distance_labels, "", + "Group the reuse distance of a block using these labels. Reuse " + "distance is defined as the cumulated size of unique blocks read " + "between two consecutive accesses on the same block."); +DEFINE_string( + reuse_distance_buckets, "", + "Group blocks by their reuse distances given these buckets. For " + "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse distance less than 1KB, between 1K and 1M, between 1M " + "and 1G, respectively. The last bucket contains the number of blocks with " + "reuse distance larger than 1G. "); +DEFINE_string( + reuse_interval_labels, "", + "Group the reuse interval of a block using these labels. Reuse " + "interval is defined as the time between two consecutive accesses " + "on the same block."); +DEFINE_string( + reuse_interval_buckets, "", + "Group blocks by their reuse interval given these buckets. For " + "example, if 'reuse_distance_buckets' is '1,10,100', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse interval less than 1 second, between 1 second and 10 " + "seconds, between 10 seconds and 100 seconds, respectively. The last " + "bucket contains the number of blocks with reuse interval longer than 100 " + "seconds."); namespace rocksdb { namespace { const std::string kMissRatioCurveFileName = "mrc"; const std::string kGroupbyBlock = "block"; const std::string kGroupbyColumnFamily = "cf"; const std::string kGroupbySSTFile = "sst"; const std::string kGroupbyBlockType = "bt"; const std::string kGroupbyCaller = "caller"; const std::string kGroupbyLevel = "level"; const std::string kGroupbyAll = "all"; const std::set<std::string> kGroupbyLabels{ kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel, kGroupbyBlockType, kGroupbyCaller, kGroupbyAll}; std::string block_type_to_string(TraceType type) { switch (type) { case kBlockTraceFilterBlock: @@ -146,8 +203,9 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { trace_start_time_ = access.access_timestamp; } // access.access_timestamp is in microseconds.
- if (!warmup_complete_ && trace_start_time_ + warmup_seconds_ * 1000000 <= - access.access_timestamp) { + if (!warmup_complete_ && + trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= + access.access_timestamp) { for (auto& sim_cache : sim_caches_) { sim_cache->reset_counter(); } @@ -162,14 +220,16 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { } } -void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { +void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (!cache_simulator_) { return; } - if (output_miss_ratio_curve_path_.empty()) { + if (output_dir_.empty()) { return; } - std::ofstream out(output_miss_ratio_curve_path_); + const std::string output_miss_ratio_curve_path = + output_dir_ + "/" + kMissRatioCurveFileName; + std::ofstream out(output_miss_ratio_curve_path); if (!out.is_open()) { return; } @@ -203,14 +263,345 @@ void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { out.close(); } +std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr( + const std::string& label_str) const { + std::stringstream ss(label_str); + std::set<std::string> labels; + // label_str is in the form of "label1_label2_label3", e.g., cf_bt. + while (ss.good()) { + std::string label_name; + getline(ss, label_name, '_'); + if (kGroupbyLabels.find(label_name) == kGroupbyLabels.end()) { + // Unknown label name. + fprintf(stderr, "Unknown label name %s, label string %s\n", + label_name.c_str(), label_str.c_str()); + return {}; + } + labels.insert(label_name); + } + return labels; +} + +std::string BlockCacheTraceAnalyzer::BuildLabel( + const std::set<std::string>& labels, const std::string& cf_name, + uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller, + const std::string& block_key) const { + std::map<std::string, std::string> label_value_map; + label_value_map[kGroupbyAll] = kGroupbyAll; + label_value_map[kGroupbyLevel] = std::to_string(level); + label_value_map[kGroupbyCaller] = caller_to_string(caller); + label_value_map[kGroupbySSTFile] = std::to_string(fd); + label_value_map[kGroupbyBlockType] = block_type_to_string(type); + label_value_map[kGroupbyColumnFamily] = cf_name; + label_value_map[kGroupbyBlock] = block_key; + // Concatenate the label values. + std::string label; + for (auto const& l : labels) { + label += label_value_map[l]; + label += "-"; + } + if (!label.empty()) { + label.pop_back(); + } + return label; +} + +void BlockCacheTraceAnalyzer::WriteAccessTimeline( + const std::string& label_str) const { + std::set<std::string> labels = ParseLabelStr(label_str); + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + std::map<std::string, std::map<uint64_t, uint64_t>> label_access_timeline; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block.
+ for (auto const& timeline : + block_access_info.second.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string& block_key = block_access_info.first; + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_key); + for (auto const& naccess : timeline.second) { + const uint64_t timestamp = naccess.first; + const uint64_t num = naccess.second; + label_access_timeline[label][timestamp] += num; + start_time = std::min(start_time, timestamp); + end_time = std::max(end_time, timestamp); + } + } + } + } + } + } + + // We have label_access_timeline now. Write them into a file. + const std::string output_path = + output_dir_ + "/" + label_str + "_access_timeline"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (auto const& label : label_access_timeline) { + header += ","; + header += label.first; + } + out << header << std::endl; + std::string row; + for (uint64_t now = start_time; now <= end_time; now++) { + row = std::to_string(now); + for (auto const& label : label_access_timeline) { + auto it = label.second.find(now); + row += ","; + if (it != label.second.end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); +} + +void BlockCacheTraceAnalyzer::WriteReuseDistance( + const std::string& label_str, + const std::set<uint64_t>& distance_buckets) const { + std::set<std::string> labels = ParseLabelStr(label_str); + std::map<std::string, std::map<uint64_t, uint64_t>> label_distance_num_reuses; + uint64_t total_num_reuses = 0; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + const std::string& block_key = block_access_info.first; + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); + if (label_distance_num_reuses.find(label) == + label_distance_num_reuses.end()) { + // The first time we encounter this label. + for (auto const& distance_bucket : distance_buckets) { + label_distance_num_reuses[label][distance_bucket] = 0; + } + } + for (auto const& reuse_distance : + block_access_info.second.reuse_distance_count) { + label_distance_num_reuses[label] + .upper_bound(reuse_distance.first) + ->second += reuse_distance.second; + total_num_reuses += reuse_distance.second; + } + } + } + } + } + + // We have label_naccesses and label_distance_num_reuses now. Write them into + // a file. + const std::string output_path = + output_dir_ + "/" + label_str + "_reuse_distance"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_distance_num_reuses) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + // Absolute values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats(
+    const std::string& label, const std::set<uint64_t>& time_buckets,
+    const std::map<uint64_t, uint64_t> timeline,
+    std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses,
+    uint64_t* total_num_reuses) const {
+  assert(label_time_num_reuses);
+  assert(total_num_reuses);
+  if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) {
+    // The first time we encounter this label.
+    for (auto const& time_bucket : time_buckets) {
+      (*label_time_num_reuses)[label][time_bucket] = 0;
+    }
+  }
+  auto it = timeline.begin();
+  uint64_t prev_timestamp = it->first;
+  const uint64_t prev_num = it->second;
+  it++;
+  // Reused within one second.
+  if (prev_num > 1) {
+    (*label_time_num_reuses)[label].upper_bound(1)->second += prev_num - 1;
+    *total_num_reuses += prev_num - 1;
+  }
+  while (it != timeline.end()) {
+    const uint64_t timestamp = it->first;
+    const uint64_t num = it->second;
+    const uint64_t reuse_interval = timestamp - prev_timestamp;
+    (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += num;
+    *total_num_reuses += num;
+    prev_timestamp = timestamp;
+    it++;
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseInterval(
+    const std::string& label_str,
+    const std::set<uint64_t>& time_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_time_num_reuses;
+  uint64_t total_num_reuses = 0;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          const std::string& block_key = block_access_info.first;
+          if (labels.find(kGroupbyCaller) != labels.end()) {
+            for (auto const& timeline :
+                 block_access_info.second.caller_num_accesses_timeline) {
+              const TableReaderCaller caller = timeline.first;
+              const std::string label = BuildLabel(labels, cf_name, fd, level,
+                                                   type, caller, block_key);
+              UpdateReuseIntervalStats(label, time_buckets, timeline.second,
+                                       &label_time_num_reuses,
+                                       &total_num_reuses);
+            }
+            continue;
+          }
+          // Does not group by caller so we need to flatten the access timeline.
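+          // kMaxBlockCacheLookupCaller acts as a placeholder caller value
+          // here; BuildLabel ignores it unless grouping by caller.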
+          const std::string label = BuildLabel(
+              labels, cf_name, fd, level, type,
+              TableReaderCaller::kMaxBlockCacheLookupCaller, block_key);
+          std::map<uint64_t, uint64_t> timeline;
+          for (auto const& caller_timeline :
+               block_access_info.second.caller_num_accesses_timeline) {
+            for (auto const& time_naccess : caller_timeline.second) {
+              timeline[time_naccess.first] += time_naccess.second;
+            }
+          }
+          UpdateReuseIntervalStats(label, time_buckets, timeline,
+                                   &label_time_num_reuses, &total_num_reuses);
+        }
+      }
+    }
+  }
+
+  // We have label_time_num_reuses now. Write it into a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_reuse_interval";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_time_num_reuses) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  // Absolute values.
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_time_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_time_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
 BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer(
-    const std::string& trace_file_path,
-    const std::string& output_miss_ratio_curve_path,
+    const std::string& trace_file_path, const std::string& output_dir,
     std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator)
-    : trace_file_path_(trace_file_path),
-      output_miss_ratio_curve_path_(output_miss_ratio_curve_path),
-      cache_simulator_(std::move(cache_simulator)) {
-  env_ = rocksdb::Env::Default();
+    : env_(rocksdb::Env::Default()),
+      trace_file_path_(trace_file_path),
+      output_dir_(output_dir),
+      cache_simulator_(std::move(cache_simulator)) {}
+
+void BlockCacheTraceAnalyzer::ComputeReuseDistance(
+    BlockAccessInfo* info) const {
+  assert(info);
+  if (info->num_accesses == 0) {
+    return;
+  }
+  uint64_t reuse_distance = 0;
+  for (auto const& block_key : info->unique_blocks_since_last_access) {
+    auto const& it = block_info_map_.find(block_key);
+    // This block must exist.
+    assert(it != block_info_map_.end());
+    reuse_distance += it->second->block_size;
+  }
+  info->reuse_distance_count[reuse_distance] += 1;
+  // Clear the set so that the next access to this block computes a fresh
+  // reuse distance.
+  info->unique_blocks_since_last_access.clear();
 }
 
 void BlockCacheTraceAnalyzer::RecordAccess(
@@ -223,7 +614,23 @@ void BlockCacheTraceAnalyzer::RecordAccess(
       file_aggr.block_type_aggregates_map[access.block_type];
   BlockAccessInfo& block_access_info =
       block_type_aggr.block_access_info_map[access.block_key];
+  ComputeReuseDistance(&block_access_info);
   block_access_info.AddAccess(access);
+  block_info_map_[access.block_key] = &block_access_info;
+
+  // Add this block to all existing blocks.
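+  // Inserting this access's block key into every block's unique-blocks set
+  // lets ComputeReuseDistance sum the sizes of distinct blocks read before
+  // each block's next access.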
+ for (auto& cf_aggregates : cf_aggregates_map_) { + for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + for (auto& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + for (auto& existing_block : + block_type_aggregates.second.block_access_info_map) { + existing_block.second.unique_blocks_since_last_access.insert( + access.block_key); + } + } + } + } } Status BlockCacheTraceAnalyzer::Analyze() { @@ -659,6 +1066,18 @@ std::vector parse_cache_config_file( return configs; } +std::set parse_buckets(const std::string& bucket_str) { + std::set buckets; + std::stringstream ss(bucket_str); + while (ss.good()) { + std::string bucket; + getline(ss, bucket, ','); + buckets.insert(ParseUint64(bucket)); + } + buckets.insert(port::kMaxUint64); + return buckets; +} + int block_cache_trace_analyzer_tool(int argc, char** argv) { ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_block_cache_trace_path.empty()) { @@ -678,7 +1097,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { warmup_seconds, downsample_ratio, cache_configs)); } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, - FLAGS_output_miss_ratio_curve_path, + FLAGS_block_cache_analysis_result_dir, std::move(cache_simulator)); Status s = analyzer.Analyze(); if (!s.IsIncomplete()) { @@ -701,7 +1120,38 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.PrintDataBlockAccessStats(); } print_break_lines(/*num_break_lines=*/3); - analyzer.PrintMissRatioCurves(); + analyzer.WriteMissRatioCurves(); + + if (!FLAGS_timeline_labels.empty()) { + std::stringstream ss(FLAGS_timeline_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteAccessTimeline(label); + } + } + + if (!FLAGS_reuse_distance_labels.empty() && + !FLAGS_reuse_distance_buckets.empty()) { + std::set buckets = parse_buckets(FLAGS_reuse_distance_buckets); + std::stringstream ss(FLAGS_reuse_distance_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseDistance(label, buckets); + } + } + + if (!FLAGS_reuse_interval_labels.empty() && + !FLAGS_reuse_interval_buckets.empty()) { + std::set buckets = parse_buckets(FLAGS_reuse_interval_buckets); + std::stringstream ss(FLAGS_reuse_interval_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseInterval(label, buckets); + } + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index c953ecf2164..21a99f7db76 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include "rocksdb/env.h" @@ -14,6 +15,8 @@ namespace rocksdb { +const uint64_t kMicrosInSecond = 1000000; + class BlockCacheTraceAnalyzer; // A cache configuration provided by user. @@ -73,6 +76,14 @@ struct BlockAccessInfo { non_exist_key_num_access_map; // for keys do not exist in this block. uint64_t num_referenced_key_exist_in_block = 0; std::map caller_num_access_map; + // caller:timestamp:number_of_accesses. The granularity of the timestamp is + // seconds. + std::map> + caller_num_accesses_timeline; + // Unique blocks since the last access. + std::set unique_blocks_since_last_access; + // Number of reuses grouped by reuse distance. 
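+  // The distance is in bytes: the cumulative size of unique blocks read
+  // between two consecutive accesses to this block.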
+  std::map<uint64_t, uint64_t> reuse_distance_count;
 
   void AddAccess(const BlockCacheTraceRecord& access) {
     if (first_access_time == 0) {
@@ -82,10 +93,13 @@ struct BlockAccessInfo {
     block_size = access.block_size;
     caller_num_access_map[access.caller]++;
     num_accesses++;
+    // access.access_timestamp is in microseconds.
+    const uint64_t timestamp_in_seconds =
+        access.access_timestamp / kMicrosInSecond;
+    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
     if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type,
                                                         access.caller)) {
       num_keys = access.num_keys_in_block;
-
       if (access.referenced_key_exist_in_block == Boolean::kTrue) {
         key_num_access_map[access.referenced_key]++;
         num_referenced_key_exist_in_block++;
@@ -115,8 +129,7 @@ struct ColumnFamilyAccessInfoAggregate {
 class BlockCacheTraceAnalyzer {
  public:
   BlockCacheTraceAnalyzer(
-      const std::string& trace_file_path,
-      const std::string& output_miss_ratio_curve_path,
+      const std::string& trace_file_path, const std::string& output_dir,
       std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
   ~BlockCacheTraceAnalyzer() = default;
   // No copy and move.
@@ -165,7 +178,24 @@ class BlockCacheTraceAnalyzer {
   // accesses on keys exist in a data block and its break down by column family.
   void PrintDataBlockAccessStats() const;
 
-  void PrintMissRatioCurves() const;
+  // Write miss ratio curves of simulated cache configurations into a csv file
+  // saved in 'output_dir'.
+  void WriteMissRatioCurves() const;
+
+  // Write the access timeline into a csv file saved in 'output_dir'.
+  void WriteAccessTimeline(const std::string& label) const;
+
+  // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
+  // distance is defined as the cumulative size of unique blocks read between
+  // two consecutive accesses on the same block.
+  void WriteReuseDistance(const std::string& label_str,
+                          const std::set<uint64_t>& distance_buckets) const;
+
+  // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
+  // interval is defined as the time between two consecutive accesses on the
+  // same block.
+ void WriteReuseInterval(const std::string& label_str, + const std::set& time_buckets) const; const std::map& TEST_cf_aggregates_map() const { @@ -173,15 +203,33 @@ class BlockCacheTraceAnalyzer { } private: + std::set ParseLabelStr(const std::string& label_str) const; + + std::string BuildLabel(const std::set& labels, + const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + TableReaderCaller caller, + const std::string& block_key) const; + + void ComputeReuseDistance(BlockAccessInfo* info) const; + void RecordAccess(const BlockCacheTraceRecord& access); + void UpdateReuseIntervalStats( + const std::string& label, const std::set& time_buckets, + const std::map timeline, + std::map>* + label_time_num_reuses, + uint64_t* total_num_reuses) const; + rocksdb::Env* env_; const std::string trace_file_path_; - const std::string output_miss_ratio_curve_path_; + const std::string output_dir_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; + std::map block_info_map_; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index c361ba054ac..80734565a3d 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -49,7 +49,13 @@ class BlockCacheTracerTest : public testing::Test { EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config"; - output_miss_ratio_curve_path_ = test_path_ + "/out_miss_ratio_curve"; + timeline_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_buckets_ = "1,1K,1M,1G"; + reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_interval_buckets_ = "1,10,100,1000"; } ~BlockCacheTracerTest() override { @@ -85,11 +91,12 @@ class BlockCacheTracerTest : public testing::Test { assert(writer); for (uint32_t i = 0; i < nblocks; i++) { uint32_t key_id = from_key_id + i; + uint32_t timestamp = (key_id + 1) * kMicrosInSecond; BlockCacheTraceRecord record; record.block_type = block_type; record.block_size = kBlockSize + key_id; record.block_key = kBlockKeyPrefix + std::to_string(key_id); - record.access_timestamp = env_->NowMicros(); + record.access_timestamp = timestamp; record.cf_id = kCFId; record.cf_name = kDefaultColumnFamilyName; record.caller = GetCaller(key_id); @@ -146,11 +153,17 @@ class BlockCacheTracerTest : public testing::Test { "./block_cache_trace_analyzer", "-block_cache_trace_path=" + trace_file_path_, "-block_cache_sim_config_path=" + block_cache_sim_config_path_, - "-output_miss_ratio_curve_path=" + output_miss_ratio_curve_path_, + "-block_cache_analysis_result_dir=" + test_path_, "-print_block_size_stats", "-print_access_count_stats", "-print_data_block_access_count_stats", - "-cache_sim_warmup_seconds=0"}; + "-cache_sim_warmup_seconds=0", + "-timeline_labels=" + timeline_labels_, + "-reuse_distance_labels=" + reuse_distance_labels_, + "-reuse_distance_buckets=" + reuse_distance_buckets_, + "-reuse_interval_labels=" + reuse_interval_labels_, + "-reuse_interval_buckets=" + reuse_interval_buckets_, + }; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -168,10 +181,14 @@ class BlockCacheTracerTest : public testing::Test { Env* env_; EnvOptions env_options_; - 
std::string output_miss_ratio_curve_path_; std::string block_cache_sim_config_path_; std::string trace_file_path_; std::string test_path_; + std::string timeline_labels_; + std::string reuse_distance_labels_; + std::string reuse_distance_buckets_; + std::string reuse_interval_labels_; + std::string reuse_interval_buckets_; }; TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { @@ -199,7 +216,8 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { // Validate the cache miss ratios. const std::vector expected_capacities{1024, 1024 * 1024, 1024 * 1024 * 1024}; - std::ifstream infile(output_miss_ratio_curve_path_); + const std::string mrc_path = test_path_ + "/mrc"; + std::ifstream infile(mrc_path); uint32_t config_index = 0; std::string line; // Read header. @@ -224,8 +242,91 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } ASSERT_EQ(expected_capacities.size(), config_index); infile.close(); + ASSERT_OK(env_->DeleteFile(mrc_path)); + } + { + // Validate the timeline csv files. + const uint32_t expected_num_lines = 50; + std::stringstream ss(timeline_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::string timeline_file = + test_path_ + "/" + l + "_access_timeline"; + std::ifstream infile(timeline_file); + std::string line; + uint32_t nlines = 0; + ASSERT_TRUE(getline(infile, line)); + uint64_t expected_time = 1; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + uint32_t naccesses = 0; + std::string substr; + uint32_t time = 0; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (time == 0) { + time = ParseUint32(substr); + continue; + } + naccesses += ParseUint32(substr); + } + nlines++; + ASSERT_EQ(1, naccesses); + ASSERT_EQ(expected_time, time); + expected_time += 1; + } + ASSERT_EQ(expected_num_lines, nlines); + ASSERT_OK(env_->DeleteFile(timeline_file)); + } + } + { + // Validate the reuse_interval and reuse_distance csv files. 
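+    // Each entry maps an output file suffix to the comma-separated label
+    // list whose csv files should be validated and then deleted.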
+ std::map test_reuse_csv_files; + test_reuse_csv_files["_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_; + for (auto const& test : test_reuse_csv_files) { + const std::string& file_suffix = test.first; + const std::string& labels = test.second; + const uint32_t expected_num_rows = 10; + const uint32_t expected_num_rows_absolute_values = 5; + const uint32_t expected_reused_blocks = 0; + std::stringstream ss(labels); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix; + std::ifstream infile(reuse_csv_file); + std::string line; + ASSERT_TRUE(getline(infile, line)); + uint32_t nblocks = 0; + double npercentage = 0; + uint32_t nrows = 0; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + bool label_read = false; + nrows++; + while (ss_naccess.good()) { + std::string substr; + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!label_read) { + label_read = true; + continue; + } + if (nrows < expected_num_rows_absolute_values) { + nblocks += ParseUint32(substr); + } else { + npercentage += ParseDouble(substr); + } + } + } + ASSERT_EQ(expected_num_rows, nrows); + ASSERT_EQ(expected_reused_blocks, nblocks); + ASSERT_LT(npercentage, 0); + ASSERT_OK(env_->DeleteFile(reuse_csv_file)); + } + } } - ASSERT_OK(env_->DeleteFile(output_miss_ratio_curve_path_)); ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); } From b4d72094280e1e0220ec321779902aba6662db25 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Mon, 24 Jun 2019 20:50:35 -0700 Subject: [PATCH 180/572] Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). 
But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a --- HISTORY.md | 1 + db/db_iterator_test.cc | 142 ++++ include/rocksdb/table.h | 24 +- java/rocksjni/portal.h | 7 +- options/options_helper.cc | 4 +- table/block_based/block.cc | 77 ++- table/block_based/block.h | 103 +-- table/block_based/block_based_table_reader.cc | 653 ++++++++++-------- table/block_based/block_based_table_reader.h | 107 ++- table/block_based/block_test.cc | 253 +++---- .../block_based/data_block_hash_index_test.cc | 8 +- table/block_based/index_builder.cc | 10 +- table/block_based/index_builder.h | 48 +- table/block_based/partitioned_filter_block.cc | 25 +- table/block_fetcher.cc | 1 - table/format.cc | 52 ++ table/format.h | 29 + table/internal_iterator.h | 7 +- table/iterator.cc | 8 +- table/meta_blocks.cc | 17 +- table/table_test.cc | 319 ++++++++- table/two_level_iterator.cc | 26 +- table/two_level_iterator.h | 7 +- test_util/testutil.cc | 9 +- util/coding.h | 13 + 25 files changed, 1362 insertions(+), 588 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 975ece580d4..07eb2759736 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -41,6 +41,7 @@ * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior. * When reading from option file/string/map, customized envs can be filled according to object registry. * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator. +* Add index type BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It significantly reduces read amplification in some setups, especially for iterator seeks. It's not fully implemented yet: IO errors are not handled right. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index e2b9f503ffb..d514e7683de 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -1049,6 +1049,148 @@ TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { ASSERT_EQ(upper_bound_hits, 1); } } + +// Enable kBinarySearchWithFirstKey, do some iterator operations and check that +// they don't do unnecessary block reads. 
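+// The block cache is sized to hold all blocks, so BLOCK_CACHE_DATA_MISS
+// counts distinct block reads and BLOCK_CACHE_DATA_HIT counts re-reads.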
+TEST_P(DBIteratorTest, IndexWithFirstKey) { + for (int tailing = 0; tailing < 2; ++tailing) { + SCOPED_TRACE("tailing = " + std::to_string(tailing)); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a1", "x1")); + ASSERT_OK(Merge("b1", "y1")); + ASSERT_OK(Merge("c0", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a2", "x2")); + ASSERT_OK(Merge("b2", "y2")); + ASSERT_OK(Merge("c0", "z2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a3", "x3")); + ASSERT_OK(Merge("b3", "y3")); + ASSERT_OK(Merge("c3", "z3")); + ASSERT_OK(Flush()); + + // Block cache is not important for this test. + // We use BLOCK_CACHE_DATA_* counters just because they're the most readily + // available way of counting block accesses. + + ReadOptions ropt; + ropt.tailing = tailing; + std::unique_ptr iter(NewIterator(ropt)); + + iter->Seek("b10"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b3", iter->key().ToString()); + EXPECT_EQ("y3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Seek("c0"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c0", iter->key().ToString()); + EXPECT_EQ("z1,z2", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c3", iter->key().ToString()); + EXPECT_EQ("z3", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter.reset(); + + // Enable iterate_upper_bound and check that iterator is not trying to read + // blocks that are fully above upper bound. 
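+    // The iterator should detect from the index alone that every block at or
+    // after "b3" is out of bounds, so the miss counter must stay unchanged.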
+ std::string ub = "b3"; + Slice ub_slice(ub); + ropt.iterate_upper_bound = &ub_slice; + iter.reset(NewIterator(ropt)); + + iter->Seek("b2"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + } +} + +TEST_P(DBIteratorTest, IndexWithFirstKeyGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a", "x1")); + ASSERT_OK(Merge("c", "y1")); + ASSERT_OK(Merge("e", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("c", "y2")); + ASSERT_OK(Merge("e", "z2")); + ASSERT_OK(Flush()); + + // Get() between blocks shouldn't read any blocks. + ASSERT_EQ("NOT_FOUND", Get("b")); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Get() of an existing key shouldn't read any unnecessary blocks when there's + // only one key per block. + + ASSERT_EQ("y1,y2", Get("c")); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ("x1", Get("a")); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(std::vector({"NOT_FOUND", "z1,z2"}), + MultiGet({"b", "e"})); +} + // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 88fcc78ed8c..929239100a4 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -93,14 +93,32 @@ struct BlockBasedTableOptions { enum IndexType : char { // A space efficient index block that is optimized for // binary-search-based index. - kBinarySearch, + kBinarySearch = 0x00, // The hash index, if enabled, will do the hash lookup when // `Options.prefix_extractor` is provided. - kHashSearch, + kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. - kTwoLevelIndexSearch, + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. + // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. 
+ // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + // + // IO errors are not handled correctly in this mode right now: if an error + // happens when lazily reading a block in value(), value() returns empty + // slice, and you need to call Valid()/status() afterwards. + // TODO(kolmike): Fix it. + kBinarySearchWithFirstKey = 0x03, }; IndexType index_type = kBinarySearch; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index d1585fcfa80..667af809bdc 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5902,8 +5902,10 @@ class IndexTypeJni { return 0x0; case rocksdb::BlockBasedTableOptions::IndexType::kHashSearch: return 0x1; - case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: + case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: return 0x2; + case rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey: + return 0x3; default: return 0x7F; // undefined } @@ -5920,6 +5922,9 @@ class IndexTypeJni { return rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; case 0x2: return rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + case 0x3: + return rocksdb::BlockBasedTableOptions::IndexType:: + kBinarySearchWithFirstKey; default: // undefined/default return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; diff --git a/options/options_helper.cc b/options/options_helper.cc index 71a7f9b2fc0..47aba7ad035 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -1671,7 +1671,9 @@ std::unordered_map {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; std::unordered_map OptionsHelper::block_base_table_data_block_index_type_string_map = { diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 6c7e46d5969..8fa3ff9b986 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -608,8 +608,7 @@ bool IndexBlockIter::ParseNextIndexKey() { } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed - if (value_delta_encoded_) { - assert(value_length == 0); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { DecodeCurrentValue(shared); } return true; @@ -627,24 +626,32 @@ bool IndexBlockIter::ParseNextIndexKey() { // Otherwise the format is delta-size = block handle size - size of last block // handle. 
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { - assert(value_delta_encoded_); - const char* limit = data_ + restarts_; - if (shared == 0) { - uint64_t o, s; - const char* newp = GetVarint64Ptr(value_.data(), limit, &o); - assert(newp); - newp = GetVarint64Ptr(newp, limit, &s); - assert(newp); - decoded_value_ = BlockHandle(o, s); - value_ = Slice(value_.data(), newp - value_.data()); - } else { - uint64_t next_value_base = - decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; - int64_t delta; - const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); - decoded_value_ = - BlockHandle(next_value_base, decoded_value_.size() + delta); - value_ = Slice(value_.data(), newp - value_.data()); + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. + + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); } } @@ -875,14 +882,10 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, } } -template <> -DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, - DataBlockIter* iter, Statistics* stats, - bool /*total_order_seek*/, - bool /*key_includes_seq*/, - bool /*value_is_full*/, - bool block_contents_pinned, - BlockPrefixIndex* /*prefix_index*/) { +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -913,13 +916,11 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, return ret_iter; } -template <> -IndexBlockIter* Block::NewIterator(const Comparator* cmp, - const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -938,9 +939,9 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, value_is_full, - block_contents_pinned, - nullptr /* data_block_hash_index */); + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); } return ret_iter; diff --git a/table/block_based/block.h b/table/block_based/block.h index 2bb577d33bd..3af92b6a262 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -165,17 +165,7 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // - // key_includes_seq, default true, means that the keys are in internal key - // format. - // value_is_full, default true, means that no delta encoding is - // applied to values. - // - // NewIterator - // Same as above but also updates read_amp_bitmap_ if it is not nullptr. - // - // NewIterator - // If `prefix_index` is not nullptr this block will do hash lookup for the key - // prefix. If total_order_seek is true, prefix_index_ is ignored. + // Updates read_amp_bitmap_ if it is not nullptr. // // If `block_contents_pinned` is true, the caller will guarantee that when // the cleanup functions are transferred from the iterator to other @@ -188,13 +178,32 @@ class Block { // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. - template - TBlockIter* NewIterator( - const Comparator* comparator, const Comparator* user_comparator, - TBlockIter* iter = nullptr, Statistics* stats = nullptr, - bool total_order_seek = true, bool key_includes_seq = true, - bool value_is_full = true, bool block_contents_pinned = false, - BlockPrefixIndex* prefix_index = nullptr); + + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -471,7 +480,7 @@ class DataBlockIter final : public BlockIter { bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -483,23 +492,12 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. 
- IndexBlockIter(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) - : IndexBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned, - value_is_full, nullptr /* data_block_hash_index */); - } - void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned, - DataBlockHashIndex* /*data_block_hash_index*/) { + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { InitializeBase(key_includes_seq ? comparator : user_comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); @@ -507,6 +505,12 @@ class IndexBlockIter final : public BlockIter { key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } } Slice user_key() const override { @@ -516,16 +520,17 @@ class IndexBlockIter final : public BlockIter { return key(); } - virtual BlockHandle value() const override { + virtual IndexValue value() const override { assert(Valid()); - if (value_delta_encoded_) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { return decoded_value_; } else { - BlockHandle handle; + IndexValue entry; Slice v = value_; - Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); assert(decode_s.ok()); - return handle; + return entry; } } @@ -552,10 +557,15 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + private: // Key is in InternalKey format bool key_includes_seq_; bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; // Whether the value is delta encoded. In that case the value is assumed to be // BlockHandle. The first value in each restart interval is the full encoded @@ -563,7 +573,22 @@ class IndexBlockIter final : public BlockIter { // offset of delta encoded BlockHandles is computed by adding the size of // previous delta encoded values in the same restart interval to the offset of // the first value in that restart interval. - BlockHandle decoded_value_; + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. 
+ IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr global_seqno_state_; bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 5b2f515006f..5344625ec94 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -191,24 +191,22 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { return &table_->get_rep()->internal_comparator; } - bool index_key_includes_seq() const { + bool index_has_first_key() const { assert(table_ != nullptr); assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } - const TableProperties* const properties = - table_->get_rep()->table_properties.get(); - - return properties == nullptr || !properties->index_key_is_user_key; + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; } bool index_value_is_full() const { assert(table_ != nullptr); assert(table_->get_rep() != nullptr); - - const TableProperties* const properties = - table_->get_rep()->table_properties.get(); - - return properties == nullptr || !properties->index_value_is_delta_encoded; + return table_->get_rep()->index_value_is_full; } Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, @@ -305,7 +303,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -319,10 +317,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } - InternalIteratorBase* it = nullptr; + InternalIteratorBase* it = nullptr; Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index @@ -330,26 +328,24 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. it = NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState( - table(), &partition_map_, index_key_includes_seq(), - index_value_is_full()), - index_block.GetValue()->NewIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq(), - index_value_is_full())); + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); } else { ReadOptions ro; ro.fill_cache = read_options.fill_cache; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- it = new BlockBasedTableIterator( + it = new BlockBasedTableIterator( table(), ro, *internal_comparator(), - index_block.GetValue()->NewIterator( + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq(), - index_value_is_full()), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - index_key_includes_seq(), index_value_is_full(), lookup_context ? lookup_context->caller : TableReaderCaller::kUncategorized); } @@ -368,7 +364,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { void CacheDependencies(bool pin) override { // Before read partitions, prefetch them to avoid lots of IOs BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; - auto rep = table()->rep_; + const BlockBasedTable::Rep* rep = table()->rep_; IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; @@ -386,9 +382,10 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - index_block.GetValue()->NewIterator( + index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), &biter, - kNullStats, true, index_key_includes_seq(), index_value_is_full()); + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -396,7 +393,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // Empty index. return; } - handle = biter.value(); + handle = biter.value().handle; uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -405,7 +402,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // Empty index. return; } - handle = biter.value(); + handle = biter.value().handle; uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; @@ -418,7 +415,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { biter.SeekToFirst(); auto ro = ReadOptions(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; CachableEntry block; // TODO: Support counter batch update for partitioned index and // filter blocks @@ -493,7 +490,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { return Status::OK(); } - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool /* disable_prefix_seek */, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -507,15 +504,16 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- auto it = index_block.GetValue()->NewIterator( + auto it = index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, true, index_key_includes_seq(), index_value_is_full()); + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); assert(it != nullptr); index_block.TransferTo(it); @@ -552,7 +550,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { assert(index_reader != nullptr); assert(!pin || prefetch); - auto rep = table->get_rep(); + const BlockBasedTable::Rep* rep = table->get_rep(); assert(rep != nullptr); CachableEntry index_block; @@ -636,7 +634,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { return Status::OK(); } - InternalIteratorBase* NewIterator( + InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override { @@ -650,7 +648,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { return iter; } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } Statistics* kNullStats = nullptr; @@ -658,11 +656,11 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { read_options.total_order_seek || disable_prefix_seek; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. - auto it = index_block.GetValue()->NewIterator( + auto it = index_block.GetValue()->NewIndexIterator( internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, total_order_seek, index_key_includes_seq(), - index_value_is_full(), false /* block_contents_pinned */, - prefix_index_.get()); + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); assert(it != nullptr); index_block.TransferTo(it); @@ -1083,7 +1081,6 @@ Status BlockBasedTable::Open( immortal_table); rep->file = std::move(file); rep->footer = footer; - rep->index_type = table_options.index_type; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. @@ -1113,6 +1110,8 @@ Status BlockBasedTable::Open( return s; } + // Populates table_properties and some fields that depend on it, + // such as index_type. s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), meta_iter.get(), largest_seqno); if (!s.ok()) { @@ -1317,6 +1316,24 @@ Status BlockBasedTable::ReadPropertiesBlock( BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.info_log); + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. 
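+  // The index type is persisted by the table builder as a fixed32-encoded
+  // entry in the user-collected properties.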
+ auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { @@ -1344,7 +1361,6 @@ Status BlockBasedTable::ReadRangeDelBlock( std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, /*input_iter=*/nullptr, BlockType::kRangeDeletion, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); assert(iter != nullptr); s = iter->status(); @@ -1436,7 +1452,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( &rep_->compression_dict_handle); } - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); + BlockBasedTableOptions::IndexType index_type = rep_->index_type; const bool use_cache = table_options.cache_index_and_filter_blocks; @@ -1602,8 +1618,8 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, *meta_block = std::move(meta); // meta block uses bytewise comparator. - iter->reset(meta_block->get()->NewIterator( - BytewiseComparator(), BytewiseComparator())); + iter->reset(meta_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return Status::OK(); } @@ -1846,10 +1862,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter( rep->prefix_filtering ? prefix_extractor : nullptr, rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->index_key_includes_seq, rep_->index_value_is_full); } case Rep::FilterType::kBlockFilter: @@ -2055,7 +2068,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIteratorBase* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { @@ -2076,8 +2089,8 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( template TBlockIter* BlockBasedTable::NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - BlockType block_type, bool key_includes_seq, bool index_key_is_full, - GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { PERF_TIMER_GUARD(new_table_block_iter_nanos); @@ -2106,7 +2119,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } assert(block.GetValue() != nullptr); - constexpr bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator // is destroyed as long as cleanup functions are moved to another object, // when: @@ -2117,10 +2130,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const bool block_contents_pinned 
= block.IsCached() || (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = block.GetValue()->NewIterator( - &rep_->internal_comparator, rep_->internal_comparator.user_comparator(), - iter, rep_->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); if (!block.IsCached()) { if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { @@ -2162,6 +2173,26 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( return iter; } +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, DataBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewDataIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, block_contents_pinned); +} + +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator( + const Rep* rep, Block* block, IndexBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, /* total_order_seek */ true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full, block_contents_pinned); +} + Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, @@ -2360,14 +2391,10 @@ Status BlockBasedTable::RetrieveBlock( BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, - std::unordered_map>* block_map, - bool index_key_includes_seq, bool index_key_is_full) - : table_(table), - block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq), - index_key_is_full_(index_key_is_full) {} - -InternalIteratorBase* + std::unordered_map>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( const BlockHandle& handle) { // Return a block iterator on the index partition @@ -2375,15 +2402,16 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( // This is a possible scenario since block cache might not have had space // for the partition if (block != block_map_->end()) { - auto rep = table_->get_rep(); + const Rep* rep = table_->get_rep(); assert(rep); Statistics* kNullStats = nullptr; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. 
- return block->second.GetValue()->NewIterator( + return block->second.GetValue()->NewIndexIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); } // Create an empty iterator return new IndexBlockIter(); @@ -2459,10 +2487,10 @@ bool BlockBasedTable::PrefixMayMatch( // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - std::unique_ptr> iiter(NewIndexIterator( + std::unique_ptr> iiter(NewIndexIterator( no_io_read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, - /*need_upper_bound_check=*/nullptr, lookup_context)); + /*get_context=*/nullptr, lookup_context)); iiter->Seek(internal_prefix); if (!iiter->Valid()) { @@ -2471,10 +2499,8 @@ bool BlockBasedTable::PrefixMayMatch( // and we're not really sure that we're past the end // of the file may_match = iiter->status().IsIncomplete(); - } else if ((rep_->table_properties && - rep_->table_properties->index_key_is_user_key - ? iiter->key() - : ExtractUserKey(iiter->key())) + } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key()) + : iiter->key()) .starts_with(ExtractUserKey(internal_prefix))) { // we need to check for this subtle case because our only // guarantee is that "the key is a string >= last key in that data @@ -2493,7 +2519,7 @@ bool BlockBasedTable::PrefixMayMatch( // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. - BlockHandle handle = iiter->value(); + BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( prefix, prefix_extractor, handle.offset(), /*no_io=*/false, /*const_key_ptr=*/nullptr, lookup_context); @@ -2514,8 +2540,20 @@ bool BlockBasedTable::PrefixMayMatch( template void BlockBasedTableIterator::Seek(const Slice& target) { + SeekImpl(&target); +} + +template +void BlockBasedTableIterator::SeekToFirst() { + SeekImpl(nullptr); +} + +template +void BlockBasedTableIterator::SeekImpl( + const Slice* target) { is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target)) { ResetDataIter(); return; } @@ -2523,47 +2561,82 @@ void BlockBasedTableIterator::Seek(const Slice& target) { bool need_seek_index = true; if (block_iter_points_to_real_block_ && block_iter_.Valid()) { // Reseek. - prev_index_value_ = index_iter_->value(); - // We can avoid an index seek if: - // 1. The new seek key is larger than the current key - // 2. The new seek key is within the upper bound of the block - // Since we don't necessarily know the internal key for either - // the current key or the upper bound, we check user keys and - // exclude the equality case. Considering internal keys can - // improve for the boundary cases, but it would complicate the - // code. - if (user_comparator_.Compare(ExtractUserKey(target), - block_iter_.user_key()) > 0 && - user_comparator_.Compare(ExtractUserKey(target), - index_iter_->user_key()) < 0) { - need_seek_index = false; + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. 
The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } } } if (need_seek_index) { - index_iter_->Seek(target); + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + if (!index_iter_->Valid()) { ResetDataIter(); return; } - InitDataBlock(); } - block_iter_.Seek(target); + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } - FindKeyForward(); CheckOutOfBound(); - assert( - !block_iter_.Valid() || - (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || - (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), - block_iter_.key()) <= 0)); + + if (target) { + assert( + !Valid() || + ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? 
(user_comparator_.Compare(ExtractUserKey(*target), key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); + } } template void BlockBasedTableIterator::SeekForPrev( const Slice& target) { is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; @@ -2587,10 +2660,14 @@ void BlockBasedTableIterator::SeekForPrev( index_iter_->Seek(target); if (!index_iter_->Valid()) { + if (!index_iter_->status().ok()) { + ResetDataIter(); + return; + } + index_iter_->SeekToLast(); if (!index_iter_->Valid()) { ResetDataIter(); - block_iter_points_to_real_block_ = false; return; } } @@ -2604,24 +2681,10 @@ void BlockBasedTableIterator::SeekForPrev( icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToFirst(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToFirst(); - FindKeyForward(); - CheckOutOfBound(); -} - template void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); if (!index_iter_->Valid()) { @@ -2635,9 +2698,13 @@ void BlockBasedTableIterator::SeekToLast() { template void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); + CheckOutOfBound(); } template @@ -2653,8 +2720,21 @@ bool BlockBasedTableIterator::NextAndGetResult( template void BlockBasedTableIterator::Prev() { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + FindKeyBackward(); } @@ -2667,9 +2747,9 @@ const size_t template void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value(); + BlockHandle data_block_handle = index_iter_->value().handle; if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_index_value_.offset() || + data_block_handle.offset() != prev_block_offset_ || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { @@ -2728,7 +2808,6 @@ void BlockBasedTableIterator::InitDataBlock() { Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, block_type_, - key_includes_seq_, index_key_is_full_, /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), /*for_compaction=*/lookup_context_.caller == TableReaderCaller::kCompaction); @@ -2736,6 +2815,47 @@ void BlockBasedTableIterator::InitDataBlock() { } } +template +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. 
+ block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + template void BlockBasedTableIterator::FindBlockForward() { // TODO the while loop inherits from two-level-iterator. We don't know @@ -2766,22 +2886,23 @@ void BlockBasedTableIterator::FindBlockForward() { return; } - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToFirst(); - } else { + if (!index_iter_->Valid()) { return; } - } while (!block_iter_.Valid()); -} -template -void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); + IndexValue v = index_iter_->value(); - if (!block_iter_.Valid()) { - FindBlockForward(); - } + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); } template @@ -2808,8 +2929,7 @@ void BlockBasedTableIterator::FindKeyBackward() { template void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid()) { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { is_out_of_bound_ = user_comparator_.Compare( *read_options_.iterate_upper_bound, user_key()) <= 0; } @@ -2832,8 +2952,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, - compaction_readahead_size); + caller, compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2845,7 +2964,7 @@ InternalIterator* BlockBasedTable::NewIterator( !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, caller, compaction_readahead_size); + caller, compaction_readahead_size); } } @@ -2961,7 +3080,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, get_context, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2971,12 +3090,12 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - BlockHandle handle = iiter->value(); + IndexValue v = 
iiter->value(); bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), - prefix_extractor, handle.offset(), no_io, + prefix_extractor, v.handle.offset(), no_io, /*const_ikey_ptr=*/nullptr, &lookup_context); if (not_exist_in_filter) { @@ -2986,78 +3105,85 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; - } else { - BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet}; - bool does_referenced_key_exist = false; - DataBlockIter biter; - uint64_t referenced_data_size = 0; - NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, - /*key_includes_seq=*/true, - /*index_key_is_full=*/true, get_context, &lookup_data_block_context, - /*s=*/Status(), /*prefetch_buffer*/ nullptr); + } - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } - bool may_exist = biter.SeekForGet(key); - // If user-specified timestamp is supported, we cannot end the search - // just because hash index lookup indicates the key+ts does not exist. - if (!may_exist && ts_sz == 0) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. - done = true; - } else { - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + NewDataBlockIterator( + read_options, v.handle, &biter, BlockType::kData, + get_context, &lookup_data_block_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); - done = true; - break; - } + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. 
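// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] Why the first_internal_key early
// break added above is safe: index separators are >= every key in their block,
// so an index seek lands on the first block whose separator >= the lookup key;
// if the key also sorts before that block's first key, it falls in the gap
// between two blocks and cannot exist anywhere in the file. A self-contained
// sketch of that argument (BlockMeta and MayContain are illustrative only):
#include <algorithm>
#include <string>
#include <vector>

struct BlockMeta {
  std::string separator;  // >= every key stored in the block
  std::string first_key;  // smallest key stored in the block
};

// `index` must be sorted by separator, mirroring an SST index block.
bool MayContain(const std::vector<BlockMeta>& index, const std::string& key) {
  auto it = std::lower_bound(
      index.begin(), index.end(), key,
      [](const BlockMeta& b, const std::string& k) { return b.separator < k; });
  if (it == index.end()) return false;  // past the last block
  return key >= it->first_key;          // otherwise the key is in a gap
}
// ---------------------------------------------------------------------------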
+ if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + done = true; + break; } - s = biter.status(); - } - // Write the block cache access record. - if (block_cache_tracer_) { - // Avoid making copy of block_key, cf_name, and referenced_key when - // constructing the access record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", lookup_data_block_context.block_type, - lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_data_block_context.caller, - lookup_data_block_context.is_cache_hit, - lookup_data_block_context.no_insert, - /*referenced_key=*/"", referenced_data_size, - lookup_data_block_context.num_keys_in_block, - does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), key); } if (done) { @@ -3115,7 +3241,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, sst_file_range.begin()->get_context, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -3130,21 +3256,30 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. 
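// [Editorial aside, not part of the patch] Same gap argument as in Get()
// above (see the MayContain sketch there): the index seek landed on this
// block, yet the key sorts before the block's smallest key, so it cannot be
// anywhere in this file.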
+ break; + } + bool reusing_block = true; uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); - if (iiter->value().offset() != offset) { - offset = iiter->value().offset(); + if (iiter->value().handle.offset() != offset) { + offset = iiter->value().handle.offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, iiter->value(), &biter, BlockType::kData, - /*key_includes_seq=*/false, - /*index_key_is_full=*/true, get_context, - &lookup_data_block_context, Status(), nullptr); + read_options, v.handle, &biter, BlockType::kData, + get_context, &lookup_data_block_context, Status(), nullptr); reusing_block = false; } + if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { // couldn't get block from block_cache @@ -3238,7 +3373,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, Status BlockBasedTable::Prefetch(const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; - auto user_comparator = comparator.user_comparator(); + UserComparatorWrapper user_comparator(comparator.user_comparator()); // pre-condition if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); @@ -3248,10 +3383,9 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); + iiter_unique_ptr = std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -3264,13 +3398,12 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - BlockHandle block_handle = iiter->value(); - const bool is_user_key = rep_->table_properties && - rep_->table_properties->index_key_is_user_key > 0; + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; if (end && ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || (is_user_key && - user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { if (prefetching_boundary_page) { break; } @@ -3285,7 +3418,6 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, NewDataBlockIterator( ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, /*get_context=*/nullptr, &lookup_context, Status(), /*prefetch_buffer=*/nullptr); @@ -3315,13 +3447,12 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { // Check Data blocks IndexBlockIter iiter_on_stack; BlockCacheLookupContext context{caller}; - InternalIteratorBase* iiter = NewIndexIterator( + InternalIteratorBase* iiter = NewIndexIterator( ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &context); - std::unique_ptr> iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); + iiter_unique_ptr = std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -3332,14 +3463,14 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { } Status BlockBasedTable::VerifyChecksumInBlocks( - InternalIteratorBase* index_iter) { + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle = index_iter->value(); + BlockHandle handle = index_iter->value().handle; BlockContents contents; BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, @@ -3445,31 +3576,13 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr> iiter(NewIndexIterator( + std::unique_ptr> iiter(NewIndexIterator( options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); iiter->Seek(key); assert(iiter->Valid()); - return TEST_BlockInCache(iiter->value()); -} - -BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { - // Some old version of block-based tables don't have index type present in - // table properties. If that's the case we can safely use the kBinarySearch. 
- BlockBasedTableOptions::IndexType index_type_on_file = - BlockBasedTableOptions::kBinarySearch; - if (rep_->table_properties) { - auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - // update index_type with the true type - rep_->index_type = index_type_on_file; - } - } - return index_type_on_file; + return TEST_BlockInCache(iiter->value().handle); } // REQUIRES: The following fields of rep_ should have already been populated: @@ -3483,21 +3596,20 @@ Status BlockBasedTable::CreateIndexReader( InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, IndexReader** index_reader, BlockCacheLookupContext* lookup_context) { - auto index_type_on_file = rep_->index_type; - // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. // If prefix_extractor does not match prefix_extractor_name from table // properties, turn off Hash Index by setting total_order_seek to true - switch (index_type_on_file) { + switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, index_reader, lookup_context); } - case BlockBasedTableOptions::kBinarySearch: { + case BlockBasedTableOptions::kBinarySearch: + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, index_reader, lookup_context); @@ -3527,7 +3639,7 @@ Status BlockBasedTable::CreateIndexReader( } default: { std::string error_message = - "Unrecognized index type: " + ToString(index_type_on_file); + "Unrecognized index type: " + ToString(rep_->index_type); return Status::InvalidArgument(error_message.c_str()); } } @@ -3536,7 +3648,7 @@ Status BlockBasedTable::CreateIndexReader( uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { BlockCacheLookupContext context(caller); - std::unique_ptr> index_iter( + std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/&context)); @@ -3544,7 +3656,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle = index_iter->value(); + BlockHandle handle = index_iter->value().handle; result = handle.offset(); } else { // key is past the last key in the file. 
If table_properties is not @@ -3574,7 +3686,7 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3595,9 +3707,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr, - /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr)); s = datablock_iter->status(); @@ -3806,7 +3917,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3827,8 +3938,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { Slice key = blockhandles_iter->key(); Slice user_key; InternalKey ikey; - if (rep_->table_properties && - rep_->table_properties->index_key_is_user_key != 0) { + if (!rep_->index_key_includes_seq) { user_key = key; } else { ikey.DecodeFrom(key); @@ -3838,7 +3948,9 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append(" HEX "); out_file->Append(user_key.ToString(true).c_str()); out_file->Append(": "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); out_file->Append("\n"); std::string str_key = user_key.ToString(); @@ -3857,7 +3969,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr> blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -3879,7 +3991,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { break; } - BlockHandle bh = blockhandles_iter->value(); + BlockHandle bh = blockhandles_iter->value().handle; uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); @@ -3888,15 +4000,14 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { out_file->Append("Data Block # "); out_file->Append(rocksdb::ToString(block_id)); out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); out_file->Append("\n"); out_file->Append("--------------------------------------\n"); std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value(), /*input_iter=*/nullptr, - /*type=*/BlockType::kData, - /*key_includes_seq=*/true, /*index_key_is_full=*/true, + ReadOptions(), blockhandles_iter->value().handle, + 
/*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr)); s = datablock_iter->status(); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index b03e67128e2..9300fb36a70 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -43,7 +43,6 @@ namespace rocksdb { -class BlockHandle; class Cache; class FilterBlockReader; class BlockBasedFilterBlockReader; @@ -198,7 +197,7 @@ class BlockBasedTable : public TableReader { // wraps the passed iter. In the latter case the return value points // to a different object then iter, and the callee has the ownership of the // returned object. - virtual InternalIteratorBase* NewIterator( + virtual InternalIteratorBase* NewIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) = 0; @@ -230,8 +229,7 @@ class BlockBasedTable : public TableReader { template TBlockIter* NewDataBlockIterator( const ReadOptions& ro, const BlockHandle& block_handle, - TBlockIter* input_iter, BlockType block_type, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; @@ -259,6 +257,12 @@ class BlockBasedTable : public TableReader { BlockType block_type, GetContext* get_context) const; + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); + // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and // then (3) file. If found, inserts into the cache(s) that were searched @@ -312,7 +316,7 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIteratorBase* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check, IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -355,9 +359,6 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - // Figure the index type, update it in rep_, and also return it. - BlockBasedTableOptions::IndexType UpdateIndexType(); - // Create a index reader based on the index type stored in the table. // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter @@ -410,7 +411,7 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); - Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. 
virtual FilterBlockReader* ReadFilter( @@ -446,17 +447,14 @@ class BlockBasedTable::PartitionedIndexIteratorState public: PartitionedIndexIteratorState( const BlockBasedTable* table, - std::unordered_map>* block_map, - const bool index_key_includes_seq, const bool index_key_is_full); - InternalIteratorBase* NewSecondaryIterator( + std::unordered_map>* block_map); + InternalIteratorBase* NewSecondaryIterator( const BlockHandle& index_value) override; private: // Don't own table_ const BlockBasedTable* table_; std::unordered_map>* block_map_; - bool index_key_includes_seq_; - bool index_key_is_full_; }; // Stores all the properties associated with a BlockBasedTable. @@ -564,12 +562,16 @@ struct BlockBasedTable::Rep { // still work, just not as quickly. bool blocks_definitely_zstd_compressed = false; + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + bool closed = false; const bool immortal_table; SequenceNumber get_global_seqno(BlockType block_type) const { return (block_type == BlockType::kFilter || - block_type == BlockType::kIndex || block_type == BlockType::kCompressionDictionary) ? kDisableGlobalSequenceNumber : global_seqno; @@ -602,11 +604,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, + InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, - BlockType block_type, bool key_includes_seq, - bool index_key_is_full, TableReaderCaller caller, + BlockType block_type, TableReaderCaller caller, size_t compaction_readahead_size = 0) : InternalIteratorBase(false), table_(table), @@ -620,8 +621,6 @@ class BlockBasedTableIterator : public InternalIteratorBase { need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), block_type_(block_type), - key_includes_seq_(key_includes_seq), - index_key_is_full_(index_key_is_full), lookup_context_(caller), compaction_readahead_size_(compaction_readahead_size) {} @@ -635,19 +634,38 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool NextAndGetResult(Slice* ret_key) override; void Prev() override; bool Valid() const override { - return !is_out_of_bound_ && block_iter_points_to_real_block_ && - block_iter_.Valid(); + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); } Slice key() const override { assert(Valid()); - return block_iter_.key(); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } } Slice user_key() const override { assert(Valid()); - return block_iter_.user_key(); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } } TValue value() const override { assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. 
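// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] The is_at_first_key_from_index_ /
// MaterializeCurrentBlock() pair above is a lazy-load pattern: key() can be
// answered straight from the index entry, and the data block is only read
// when value() is actually called. Reduced to its core (LazyEntry is
// illustrative, not RocksDB API):
#include <functional>
#include <optional>
#include <string>
#include <utility>

class LazyEntry {
 public:
  LazyEntry(std::string first_key, std::function<std::string()> load_block)
      : key_(std::move(first_key)), load_(std::move(load_block)) {}

  const std::string& key() const { return key_; }  // served without any I/O

  const std::string& value() {  // first call triggers the (expensive) load
    if (!value_) value_ = load_();
    return *value_;
  }

 private:
  std::string key_;
  std::function<std::string()> load_;
  std::optional<std::string> value_;
};
// ---------------------------------------------------------------------------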
+ return TValue(); + } + return block_iter_.value(); } Status status() const override { @@ -667,10 +685,17 @@ class BlockBasedTableIterator : public InternalIteratorBase { pinned_iters_mgr_ = pinned_iters_mgr; } bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); } bool IsValuePinned() const override { + // Load current block if not loaded. + if (is_at_first_key_from_index_) { + const_cast(this)->MaterializeCurrentBlock(); + } // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; @@ -704,35 +729,33 @@ class BlockBasedTableIterator : public InternalIteratorBase { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. - prev_index_value_ = index_iter_->value(); + prev_block_offset_ = index_iter_->value().handle.offset(); } } - void InitDataBlock(); - inline void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - private: const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to value() will trigger loading the block. + bool is_at_first_key_from_index_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; BlockType block_type_; - // If the keys in the blocks over which we iterate include 8 byte sequence - bool key_includes_seq_; - bool index_key_is_full_; - BlockHandle prev_index_value_; + uint64_t prev_block_offset_; BlockCacheLookupContext lookup_context_; // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. @@ -748,6 +771,16 @@ class BlockBasedTableIterator : public InternalIteratorBase { size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; std::unique_ptr prefetch_buffer_; + + // If `target` is null, seek to first. 
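// [Editorial aside, not part of the patch] Seek() and SeekToFirst() now
// funnel into SeekImpl(), with a null target meaning "position at the first
// key", so the reseek-avoidance and deferred-block-load logic lives in one
// place; the calling convention, as defined earlier in this patch, is simply:
//   void Seek(const Slice& target) { SeekImpl(&target); }
//   void SeekToFirst()             { SeekImpl(nullptr); }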
+ void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); }; } // namespace rocksdb diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 2dab4627cb6..e0ca24bf482 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -69,37 +69,12 @@ void GenerateRandomKVs(std::vector *keys, } } -// Same as GenerateRandomKVs but the values are BlockHandle -void GenerateRandomKBHs(std::vector *keys, - std::vector *values, const int from, - const int len, const int step = 1, - const int padding_size = 0, - const int keys_share_prefix = 1) { - Random rnd(302); - uint64_t offset = 0; - - // generate different prefix - for (int i = from; i < from + len; i += step) { - // generate keys that shares the prefix - for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); - - uint64_t size = rnd.Uniform(1024 * 16); - BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; - values->emplace_back(handle); - } - } -} - class BlockTest : public testing::Test {}; // block test TEST_F(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -123,7 +98,7 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, options.comparator); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -136,8 +111,7 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = - reader.NewIterator(options.comparator, options.comparator); + iter = reader.NewDataIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -152,83 +126,6 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } -TEST_F(BlockTest, ValueDeltaEncodingTest) { - Random rnd(301); - Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); - - std::vector keys; - std::vector values; - const bool kUseDeltaEncoding = true; - const bool kUseValueDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); - int num_records = 100; - - GenerateRandomKBHs(&keys, &values, 0, num_records); - // add a bunch of records to a block - BlockHandle last_encoded_handle; - for (int i = 0; i < num_records; i++) { - auto block_handle = values[i]; - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle.size()); - last_encoded_handle = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); - } - - // read serialized contents of the block - Slice rawblock = builder.Finish(); - - // create block reader - BlockContents contents; - contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); - - const bool kTotalOrderSeek = true; - const bool kIncludesSeq = true; - const 
bool kValueIsFull = !kUseValueDeltaEncoding; - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; - // read contents of block sequentially - int count = 0; - InternalIteratorBase *iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { - // read kv from block - Slice k = iter->key(); - BlockHandle handle = iter->value(); - - // compare with lookaside array - ASSERT_EQ(k.ToString().compare(keys[count]), 0); - - ASSERT_EQ(values[count].offset(), handle.offset()); - ASSERT_EQ(values[count].size(), handle.size()); - } - delete iter; - - // read block contents randomly - iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (int i = 0; i < num_records; i++) { - // find a random key in the lookaside array - int index = rnd.Uniform(num_records); - Slice k(keys[index]); - - // search in block for this key - iter->Seek(k); - ASSERT_TRUE(iter->Valid()); - BlockHandle handle = iter->value(); - ASSERT_EQ(values[index].offset(), handle.offset()); - ASSERT_EQ(values[index].size(), handle.size()); - } - delete iter; -} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, @@ -261,8 +158,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator(), - BytewiseComparator())); + reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -457,8 +353,6 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { TEST_F(BlockTest, BlockWithReadAmpBitmap) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -486,9 +380,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -519,9 +412,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -555,9 +447,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -602,6 +493,132 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) { ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32); } +class IndexBlockTest + : 
public testing::Test,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  IndexBlockTest() = default;
+
+  bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); }
+  bool includeFirstKey() const { return std::get<1>(GetParam()); }
+};
+
+// Similar to GenerateRandomKVs but for index block contents.
+void GenerateRandomIndexEntries(std::vector<std::string> *separators,
+                                std::vector<BlockHandle> *block_handles,
+                                std::vector<std::string> *first_keys,
+                                const int len) {
+  Random rnd(42);
+
+  // For each of `len` blocks, we need to generate a first and last key.
+  // Let's generate n*2 random keys, sort them, group into consecutive pairs.
+  std::set<std::string> keys;
+  while ((int)keys.size() < len * 2) {
+    // Keys need to be at least 8 bytes long to look like internal keys.
+    keys.insert(test::RandomKey(&rnd, 12));
+  }
+
+  uint64_t offset = 0;
+  for (auto it = keys.begin(); it != keys.end();) {
+    first_keys->emplace_back(*it++);
+    separators->emplace_back(*it++);
+    uint64_t size = rnd.Uniform(1024 * 16);
+    BlockHandle handle(offset, size);
+    offset += size + kBlockTrailerSize;
+    block_handles->emplace_back(handle);
+  }
+}
+
+TEST_P(IndexBlockTest, IndexValueEncodingTest) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> separators;
+  std::vector<BlockHandle> block_handles;
+  std::vector<std::string> first_keys;
+  const bool kUseDeltaEncoding = true;
+  BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding());
+  int num_records = 100;
+
+  GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
+                             num_records);
+  BlockHandle last_encoded_handle;
+  for (int i = 0; i < num_records; i++) {
+    IndexValue entry(block_handles[i], first_keys[i]);
+    std::string encoded_entry;
+    std::string delta_encoded_entry;
+    entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr);
+    if (useValueDeltaEncoding() && i > 0) {
+      entry.EncodeTo(&delta_encoded_entry, includeFirstKey(),
+                     &last_encoded_handle);
+    }
+    last_encoded_handle = entry.handle;
+    const Slice delta_encoded_entry_slice(delta_encoded_entry);
+    builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+  const bool kTotalOrderSeek = true;
+  const bool kIncludesSeq = true;
+  const bool kValueIsFull = !useValueDeltaEncoding();
+  IndexBlockIter *kNullIter = nullptr;
+  Statistics *kNullStats = nullptr;
+  // read contents of block sequentially
+  InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator(
+      options.comparator, options.comparator, kNullIter, kNullStats,
+      kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull);
+  iter->SeekToFirst();
+  for (int index = 0; index < num_records; ++index) {
+    ASSERT_TRUE(iter->Valid());
+
+    Slice k = iter->key();
+    IndexValue v = iter->value();
+
+    EXPECT_EQ(separators[index], k.ToString());
+    EXPECT_EQ(block_handles[index].offset(), v.handle.offset());
+    EXPECT_EQ(block_handles[index].size(), v.handle.size());
+    EXPECT_EQ(includeFirstKey() ?
first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + } // namespace rocksdb int main(int argc, char **argv) { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 5ec0938714f..484617d7e14 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -391,7 +391,7 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); bool may_exist; // search in block for the key just inserted { @@ -474,8 +474,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -512,8 +511,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // C true false for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index c1ce541ae56..f3a4b10e01e 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -36,7 +36,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening); + table_opt.index_shortening, /* include_first_key */ false); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder( @@ -48,6 +48,12 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); } break; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + 
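// ---------------------------------------------------------------------------
// [Editorial aside, not part of the patch] How a user would opt into the new
// index format handled by the case above once this patch lands; a minimal
// sketch with error handling omitted (the path argument is hypothetical):
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/table.h"

void OpenWithFirstKeyIndex(const std::string& path) {
  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.index_type =
      rocksdb::BlockBasedTableOptions::kBinarySearchWithFirstKey;

  rocksdb::Options opts;
  opts.create_if_missing = true;
  opts.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(opts, path, &db);
  // Newly written SST files now carry each block's first key in the index,
  // enabling the iterator's deferred block loads.
  if (s.ok()) delete db;
}
// ---------------------------------------------------------------------------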
table_opt.index_shortening, /* include_first_key */ true); + } break; default: { assert(!"Do not recognize the index type "); } break; @@ -94,7 +100,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, - table_opt_.index_shortening); + table_opt_.index_shortening, /* include_first_key */ false); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index 6baa9891b1d..47348b31f78 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -58,6 +58,7 @@ class IndexBuilder { // To allow further optimization, we provide `last_key_in_current_block` and // `first_key_in_next_block`, based on which the specific implementation can // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. // @last_key_in_current_block: this parameter maybe overridden with the value // "substitute key". // @first_key_in_next_block: it will be nullptr if the entry being added is @@ -123,7 +124,8 @@ class ShortenedIndexBuilder : public IndexBuilder { const InternalKeyComparator* comparator, const int index_block_restart_interval, const uint32_t format_version, const bool use_value_delta_encoding, - BlockBasedTableOptions::IndexShorteningMode shortening_mode) + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) : IndexBuilder(comparator), index_block_builder_(index_block_restart_interval, true /*use_delta_encoding*/, @@ -131,11 +133,19 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_without_seq_(index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { @@ -159,20 +169,27 @@ class ShortenedIndexBuilder : public IndexBuilder { } auto sep = Slice(*last_key_in_current_block); - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle_.size()); - assert(handle_delta_encoding.size() != 0); + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // 
BlockBuilder::Add() below won't use delta-encoded slice. + } last_encoded_handle_ = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_block_builder_.Add(sep, handle_encoding, - &handle_delta_encoding_slice); + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, - &handle_delta_encoding_slice); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); } + + current_block_first_internal_key_.clear(); } using IndexBuilder::Finish; @@ -200,9 +217,12 @@ class ShortenedIndexBuilder : public IndexBuilder { private: BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; bool seperator_is_key_plus_seq_; + const bool include_first_key_; BlockBasedTableOptions::IndexShorteningMode shortening_mode_; - BlockHandle last_encoded_handle_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -243,7 +263,7 @@ class HashIndexBuilder : public IndexBuilder { : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, format_version, use_value_delta_encoding, - shortening_mode), + shortening_mode, /* include_first_key */ false), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index cce6744157e..dcd985152bb 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -147,12 +147,13 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - handle = biter.value(); + handle = biter.value().handle; auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -221,15 +222,16 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); + /* have_first_key */ false, index_key_includes_seq_, + index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return BlockHandle(0, 0); } assert(iter.Valid()); - BlockHandle fltr_blk_handle = iter.value(); + BlockHandle fltr_blk_handle = iter.value().handle; return fltr_blk_handle; } @@ -280,18 +282,19 @@ void PartitionedFilterBlockReader::CacheDependencies( BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter biter; Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( + idx_on_fltr_blk_->NewIndexIterator( &comparator_, 
comparator_.user_comparator(), &biter, kNullStats, true,
-      index_key_includes_seq_, index_value_is_full_);
+      /* have_first_key */ false, index_key_includes_seq_,
+      index_value_is_full_);
   // Index partitions are assumed to be consecutive. Prefetch them all.
   // Read the first block offset
   biter.SeekToFirst();
-  BlockHandle handle = biter.value();
+  BlockHandle handle = biter.value().handle;
   uint64_t prefetch_off = handle.offset();

   // Read the last block's offset
   biter.SeekToLast();
-  handle = biter.value();
+  handle = biter.value().handle;
   uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
   uint64_t prefetch_len = last_off - prefetch_off;
   std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
@@ -304,7 +307,7 @@ void PartitionedFilterBlockReader::CacheDependencies(
   // After prefetch, read the partitions one by one
   biter.SeekToFirst();
   for (; biter.Valid(); biter.Next()) {
-    handle = biter.value();
+    handle = biter.value().handle;
     const bool no_io = true;
     const bool is_a_filter_partition = true;
     auto filter = table_->GetFilter(
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index 6fdddc37e49..81e1345d9c2 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -15,7 +15,6 @@
 #include "logging/logging.h"
 #include "memory/memory_allocator.h"
 #include "monitoring/perf_context_imp.h"
-#include "monitoring/statistics.h"
 #include "rocksdb/env.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_reader.h"
diff --git a/table/format.cc b/table/format.cc
index 2046903a703..b3eb281a2e5 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -91,6 +91,58 @@ std::string BlockHandle::ToString(bool hex) const {
 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);

+void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
+                          const BlockHandle* previous_handle) const {
+  if (previous_handle) {
+    assert(handle.offset() == previous_handle->offset() +
+                                  previous_handle->size() + kBlockTrailerSize);
+    PutVarsignedint64(dst, handle.size() - previous_handle->size());
+  } else {
+    handle.EncodeTo(dst);
+  }
+  assert(dst->size() != 0);
+
+  if (have_first_key) {
+    PutLengthPrefixedSlice(dst, first_internal_key);
+  }
+}
+
+Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
+                              const BlockHandle* previous_handle) {
+  if (previous_handle) {
+    int64_t delta;
+    if (!GetVarsignedint64(input, &delta)) {
+      return Status::Corruption("bad delta-encoded index value");
+    }
+    handle = BlockHandle(
+        previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
+        previous_handle->size() + delta);
+  } else {
+    Status s = handle.DecodeFrom(input);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!have_first_key) {
+    first_internal_key = Slice();
+  } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
+    return Status::Corruption("bad first key in block info");
+  }
+
+  return Status::OK();
+}
+
+std::string IndexValue::ToString(bool hex, bool have_first_key) const {
+  std::string s;
+  EncodeTo(&s, have_first_key, nullptr);
+  if (hex) {
+    return Slice(s).ToString(true);
+  } else {
+    return s;
+  }
+}
+
 namespace {
 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
   return magic_number == kLegacyBlockBasedTableMagicNumber ||
diff --git a/table/format.h b/table/format.h
index baad78070ca..539ca88805c 100644
--- a/table/format.h
+++ b/table/format.h
@@ -76,6 +76,35 @@ class BlockHandle {
   static const BlockHandle kNullBlockHandle;
 };

+// Value in block-based table file index.
+// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { #ifdef NDEBUG diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8f1cc9dd68e..696e66135dc 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -90,8 +90,11 @@ class InternalIteratorBase : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it is out of the iterator - // upper bound + // True if the iterator is invalidated because it reached a key that is above + // the iterator upper bound. Used by LevelIterator to decide whether it should + // stop or move on to the next file. + // Important: if iterator reached the end of the file without encountering any + // keys above the upper bound, IsOutOfBound() must return false. 
virtual bool IsOutOfBound() { return false; } // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont diff --git a/table/iterator.cc b/table/iterator.cc index 97a0cef5e08..f6c7f9cec3f 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -167,7 +167,7 @@ template InternalIteratorBase* NewErrorInternalIterator(const Status& status) { return new EmptyInternalIterator(status); } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status); template InternalIteratorBase* NewErrorInternalIterator( const Status& status); @@ -182,7 +182,7 @@ InternalIteratorBase* NewErrorInternalIterator(const Status& status, return new (mem) EmptyInternalIterator(status); } } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); @@ -191,7 +191,7 @@ template InternalIteratorBase* NewEmptyInternalIterator() { return new EmptyInternalIterator(Status::OK()); } -template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); template InternalIteratorBase* NewEmptyInternalIterator(); template @@ -203,7 +203,7 @@ InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { return new (mem) EmptyInternalIterator(Status::OK()); } } -template InternalIteratorBase* NewEmptyInternalIterator( +template InternalIteratorBase* NewEmptyInternalIterator( Arena* arena); template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 4205d298b6d..3bbc6d87080 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -229,8 +229,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); DataBlockIter iter; - properties_block.NewIterator(BytewiseComparator(), - BytewiseComparator(), &iter); + properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), + &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -386,9 +386,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, // are to compress it. 
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -459,8 +458,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -504,8 +503,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); diff --git a/table/table_test.cc b/table/table_test.cc index 2e2286efae4..418ecf004b7 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -236,7 +236,7 @@ class BlockConstructor: public Constructor { } InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewIterator(comparator_, comparator_); + return block_->NewDataIterator(comparator_, comparator_); } private: @@ -308,8 +308,9 @@ class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, - int level = -1) + int level = -1, SequenceNumber largest_seqno = 0) : Constructor(cmp), + largest_seqno_(largest_seqno), convert_to_internal_key_(convert_to_internal_key), level_(level) {} ~TableConstructor() override { Reset(); } @@ -326,6 +327,14 @@ class TableConstructor: public Constructor { std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; + + if (largest_seqno_ != 0) { + // Pretend that it's an external file written by SstFileWriter. 
+ int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + } + std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, @@ -362,7 +371,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_), + level_, largest_seqno_, nullptr), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -428,6 +437,7 @@ class TableConstructor: public Constructor { std::unique_ptr file_writer_; std::unique_ptr file_reader_; std::unique_ptr table_reader_; + SequenceNumber largest_seqno_; bool convert_to_internal_key_; int level_; @@ -1484,7 +1494,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - for (int i = 0; i < 4; ++i) { + for (int i = 0; i <= 5; ++i) { Options options; // Make each key/value an individual block table_options.block_size = 64; @@ -1515,11 +1525,16 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { options.prefix_extractor.reset(NewFixedPrefixTransform(4)); break; case 4: - default: - // Binary search index + // Two-level index table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.table_factory.reset(new BlockBasedTableFactory(table_options)); break; + case 5: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; } TableConstructor c(BytewiseComparator(), @@ -1663,10 +1678,10 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int /*suffix_len*/ = 800) { + std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); - c->Add(k.Encode().ToString(), "v"); + c->Add(k.Encode().ToString(), value); } void TableTest::IndexTest(BlockBasedTableOptions table_options) { @@ -1845,6 +1860,286 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { ASSERT_TRUE(iter->status().IsIncomplete()); } +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + IndexTest(table_options); +} + +class CustomFlushBlockPolicy : public FlushBlockPolicyFactory, + public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(std::vector keys_per_block) + : keys_per_block_(keys_per_block) {} + + const char* Name() const override { return "table_test"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + + bool Update(const Slice&, const Slice&) override { + if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) { + ++current_block_idx_; + keys_in_current_block_ = 1; + return true; + } + + ++keys_in_current_block_; + return false; + } + + std::vector keys_per_block_; + + int current_block_idx_ = 0; + int keys_in_current_block_ = 0; +}; + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { + for (int 
use_first_key = 0; use_first_key < 2; ++use_first_key) { + SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = + use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey + : BlockBasedTableOptions::kBinarySearch; + table_options.block_cache = NewLRUCache(10000); // fits all blocks + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(std::vector{2, 1, 3, 2}); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + + // Block 0. + AddInternalKey(&c, "aaaa", "v0"); + AddInternalKey(&c, "aaac", "v1"); + + // Block 1. + AddInternalKey(&c, "aaca", "v2"); + + // Block 2. + AddInternalKey(&c, "caaa", "v3"); + AddInternalKey(&c, "caac", "v4"); + AddInternalKey(&c, "caae", "v5"); + + // Block 3. + AddInternalKey(&c, "ccaa", "v6"); + AddInternalKey(&c, "ccac", "v7"); + + // Write the file. + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(8, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(4u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Shouldn't have read data blocks before iterator is seeked. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + // Seek to a key between blocks. If index contains first key, we shouldn't + // read any data blocks until value is requested. + iter->Seek(ikey("aaba")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the middle of a block. The block should be read right away. + iter->Seek(ikey("caab")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to just before the same block and don't access value. + // The iterator should keep pinning the block contents. + iter->Seek(ikey("baaa")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the same block again to check that the block is still pinned. 
+ iter->Seek(ikey("caae")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[5], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v5", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and fall through to the next block. Don't access value. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[6], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward again. Block should be read. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and reach the end. + iter->Next(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to a single-key block and step forward without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 1 : 2, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Seek between blocks and step back without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[1], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + // All blocks are in cache now, there'll be no more misses ever. + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v1", iter->value().ToString()); + + // Next into the next block again. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 4, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to first and step back without accessing value. + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[0], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Prev(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Do some SeekForPrev() and SeekToLast() just to cover all methods. + iter->SeekForPrev(ikey("caad")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 
4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + table_options.block_cache = NewLRUCache(10000); + Options options; + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, + /* level */ -1, /* largest_seqno */ 42); + + c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x"); + c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(2, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(1u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + EXPECT_NE(keys[0], iter->key().ToString()); + // Key should have been served from index, without reading data blocks. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + EXPECT_EQ("x", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + + c.ResetTableReader(); +} + // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. 
@@ -3606,9 +3901,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); bool found_properties_block = true; ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); ASSERT_TRUE(found_properties_block); @@ -3688,8 +3982,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewIterator(options.comparator, - options.comparator)}; + metaindex_block.NewDataIterator(options.comparator, options.comparator)}; uint64_t max_offset = 0; std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 7ff73cd4e4f..1cb00b63928 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -19,11 +19,11 @@ namespace rocksdb { namespace { -class TwoLevelIndexIterator : public InternalIteratorBase { +class TwoLevelIndexIterator : public InternalIteratorBase { public: explicit TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); ~TwoLevelIndexIterator() override { first_level_iter_.DeleteIter(false /* is_arena_mode */); @@ -43,7 +43,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { assert(Valid()); return second_level_iter_.key(); } - BlockHandle value() const override { + IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -69,12 +69,12 @@ class TwoLevelIndexIterator : public InternalIteratorBase { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIteratorBase* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapperBase first_level_iter_; - IteratorWrapperBase second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
@@ -83,7 +83,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { TwoLevelIndexIterator::TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} void TwoLevelIndexIterator::Seek(const Slice& target) { @@ -177,8 +177,8 @@ void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { } void TwoLevelIndexIterator::SetSecondLevelIterator( - InternalIteratorBase* iter) { - InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } @@ -186,14 +186,14 @@ void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - BlockHandle handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value().handle; if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIteratorBase* iter = + InternalIteratorBase* iter = state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); @@ -203,9 +203,9 @@ void TwoLevelIndexIterator::InitDataBlock() { } // namespace -InternalIteratorBase* NewTwoLevelIterator( +InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) { + InternalIteratorBase* first_level_iter) { return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 55d5c01a4ae..545c29f493e 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,11 +22,10 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIteratorBase* NewSecondaryIterator( + virtual InternalIteratorBase* NewSecondaryIterator( const BlockHandle& handle) = 0; }; - // Return a new two level iterator. A two-level iterator contains an // index iterator whose values point to a sequence of blocks where // each block is itself a sequence of key,value pairs. The returned @@ -37,8 +36,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIteratorBase* NewTwoLevelIterator( +extern InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 4e37cde40d1..61a49d88a17 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -9,6 +9,7 @@ #include "test_util/testutil.h" +#include #include #include @@ -197,8 +198,12 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { opt.cache_index_and_filter_blocks = rnd->Uniform(2); opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); opt.pin_top_level_index_and_filter = rnd->Uniform(2); - opt.index_type = rnd->Uniform(2) ? 
BlockBasedTableOptions::kBinarySearch
-                                     : BlockBasedTableOptions::kHashSearch;
+  using IndexType = BlockBasedTableOptions::IndexType;
+  const std::array<IndexType, 4> index_types = {
+      {IndexType::kBinarySearch, IndexType::kHashSearch,
+       IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}};
+  opt.index_type =
+      index_types[rnd->Uniform(static_cast<int>(index_types.size()))];
   opt.hash_index_allow_collision = rnd->Uniform(2);
   opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3));
   opt.block_size = rnd->Uniform(10000000);
diff --git a/util/coding.h b/util/coding.h
index 4046a2b60bf..9427d52618e 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -58,6 +58,7 @@ extern bool GetFixed32(Slice* input, uint32_t* value);
 extern bool GetFixed16(Slice* input, uint16_t* value);
 extern bool GetVarint32(Slice* input, uint32_t* value);
 extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetVarsignedint64(Slice* input, int64_t* value);
 extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
 // This function assumes data is well-formed.
 extern Slice GetLengthPrefixedSlice(const char* data);
@@ -377,6 +378,18 @@ inline bool GetVarint64(Slice* input, uint64_t* value) {
   }
 }

+inline bool GetVarsignedint64(Slice* input, int64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarsignedint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, static_cast<size_t>(limit - q));
+    return true;
+  }
+}
+
 // Provide an interface for platform independent endianness transformation
 inline uint64_t EndianTransform(uint64_t input, size_t size) {
   char* pos = reinterpret_cast<char*>(&input);

From 9dbcda9e3b9b59b76b247e24e9ebc4b9263197ff Mon Sep 17 00:00:00 2001
From: Mike Kolupaev
Date: Tue, 25 Jun 2019 22:58:56 -0700
Subject: [PATCH 181/572] Fix uninitialized prev_block_offset_ in
 BlockBasedTableReader (#5507)

Summary:
Found by valgrind_check.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5507

Differential Revision: D16002612

Pulled By: miasantreble

fbshipit-source-id: 13c11c183190e0a0571844635457d434da3ac59a
---
 table/block_based/block_based_table_reader.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 9300fb36a70..4356713910c 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -755,7 +755,7 @@ class BlockBasedTableIterator : public InternalIteratorBase {
   bool need_upper_bound_check_;
   const SliceTransform* prefix_extractor_;
   BlockType block_type_;
-  uint64_t prev_block_offset_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
   BlockCacheLookupContext lookup_context_;
   // Readahead size used in compaction, its value is used only if
   // lookup_context_.caller = kCompaction.

From a8975b62455cb73a8e23ff6be709df1b97859d2d Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Thu, 27 Jun 2019 08:31:03 -0700
Subject: [PATCH 182/572] Block cache tracer: Do not populate block cache
 trace record when tracing is disabled. (#5510)

Summary:
This PR makes sure that trace record is not populated when tracing is disabled.
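For reference, the guard added at every tracing site in the diff below is, schematically, the following sketch (member names are taken from the patch; the surrounding record-building code is elided):

```cpp
// Only pay for constructing a BlockCacheTraceRecord when a trace is
// actually being collected; is_tracing_enabled() is a relaxed atomic load.
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
    lookup_context) {
  // ... build the BlockCacheTraceRecord and call WriteBlockAccess() ...
}
```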
Before this PR: DB path: [/data/mysql/rocks_regression_tests/OPTIONS-myrocks-40-33-10000000/2019-06-26-13-04-41/db] readwhilewriting : 9.803 micros/op 1550408 ops/sec; 107.9 MB/s (5000000 of 5000000 found) Microseconds per read: Count: 80000000 Average: 9.8045 StdDev: 12.64 Min: 1 Median: 7.5246 Max: 25343 Percentiles: P50: 7.52 P75: 12.10 P99: 37.44 P99.9: 75.07 P99.99: 133.60 After this PR: DB path: [/data/mysql/rocks_regression_tests/OPTIONS-myrocks-40-33-10000000/2019-06-26-14-08-21/db] readwhilewriting : 8.723 micros/op 1662882 ops/sec; 115.8 MB/s (5000000 of 5000000 found) Microseconds per read: Count: 80000000 Average: 8.7236 StdDev: 12.19 Min: 1 Median: 6.7262 Max: 25229 Percentiles: P50: 6.73 P75: 10.50 P99: 31.54 P99.9: 74.81 P99.99: 132.82 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5510 Differential Revision: D16016428 Pulled By: HaoyuHuang fbshipit-source-id: 3b3d11e6accf207d18ec2545b802aa01ee65901f --- table/block_based/block_based_table_reader.cc | 13 ++++++++----- trace_replay/block_cache_tracer.h | 4 ++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 5344625ec94..e73b0c08c41 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1967,7 +1967,8 @@ CachableEntry BlockBasedTable::GetFilter( } } - if (block_cache_tracer_ && lookup_context) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( @@ -2048,7 +2049,8 @@ CachableEntry BlockBasedTable::GetUncompressionDict( } } } - if (block_cache_tracer_ && lookup_context) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( @@ -2273,7 +2275,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( } // Fill lookup_context. - if (block_cache_tracer_ && lookup_context) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { size_t usage = 0; uint64_t nkeys = 0; if (block_entry->GetValue()) { @@ -3167,7 +3170,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, s = biter.status(); } // Write the block cache access record. - if (block_cache_tracer_) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. BlockCacheTraceRecord access_record( @@ -3334,7 +3337,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, s = biter.status(); } // Write the block cache access. - if (block_cache_tracer_) { + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. BlockCacheTraceRecord access_record( diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e7f38db3c6d..e2ad933b9b8 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -197,6 +197,10 @@ class BlockCacheTracer { // Stop writing block cache accesses to the trace_writer. 
void EndTrace(); + bool is_tracing_enabled() const { + return writer_.load(std::memory_order_relaxed); + } + Status WriteBlockAccess(const BlockCacheTraceRecord& record, const Slice& block_key, const Slice& cf_name, const Slice& referenced_key); From c08c0ae73131457a2ac74507da58ff49870c1ee6 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 27 Jun 2019 08:54:28 -0700 Subject: [PATCH 183/572] Add C binding for secondary instance (#5505) Summary: Add C binding for secondary instance as well as unit test. Test plan (on devserver) ``` $make clean && COMPILE_WITH_ASAN=1 make -j20 all $./c_test $make check ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5505 Differential Revision: D16000043 Pulled By: riversand963 fbshipit-source-id: 3361ef6bfdf4ce12438cee7290a0ac203b5250bd --- HISTORY.md | 1 + db/c.cc | 50 +++++++++++++++++++++++++++++++++++++++++ db/c_test.c | 54 +++++++++++++++++++++++++++++++++++++++++++++ include/rocksdb/c.h | 14 ++++++++++++ 4 files changed, 119 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 07eb2759736..d3660ee64ac 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. +* Add C bindings for secondary instance, i.e. DBImplSecondary. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. 
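For illustration, a minimal usage sketch of the new C API added below (paths are placeholders; `errptr`-based error checking is elided for brevity):

```c
#include <stddef.h>
#include "rocksdb/c.h"

int main(void) {
  char* err = NULL;
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_max_open_files(opts, -1);  // secondary mode needs -1
  // Open a read-only secondary instance that tails the primary's files.
  rocksdb_t* db = rocksdb_open_as_secondary(opts, "/path/to/primary_db",
                                            "/path/to/secondary_path", &err);
  /* ... the primary writes and flushes in another process ... */
  rocksdb_try_catch_up_with_primary(db, &err);  // replay the primary's updates
  rocksdb_close(db);
  rocksdb_options_destroy(opts);
  return 0;
}
```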
diff --git a/db/c.cc b/db/c.cc index 8f96366fbed..17dc766dd66 100644 --- a/db/c.cc +++ b/db/c.cc @@ -517,6 +517,21 @@ rocksdb_t* rocksdb_open_for_read_only( return result; } +rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, + const char* name, + const char* secondary_path, + char** errptr) { + DB* db; + if (SaveError(errptr, + DB::OpenAsSecondary(options->rep, std::string(name), + std::string(secondary_path), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; @@ -717,6 +732,37 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( return result; } +rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* db_options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i != num_column_families; ++i) { + column_families.emplace_back( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep)); + } + DB* db; + std::vector handles; + if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep), + std::string(name), + std::string(secondary_path), + column_families, &handles, &db))) { + return nullptr; + } + for (size_t i = 0; i != handles.size(); ++i) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, @@ -3423,6 +3469,10 @@ void rocksdb_ingest_external_file_cf( SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep)); } +void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->TryCatchUpWithPrimary()); +} + rocksdb_slicetransform_t* rocksdb_slicetransform_create( void* state, void (*destructor)(void*), diff --git a/db/c_test.c b/db/c_test.c index 64241df287b..4b4b165c879 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -45,6 +45,7 @@ static char sstfilename[200]; static char dbbackupname[200]; static char dbcheckpointname[200]; static char dbpathname[200]; +static char secondary_path[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -1722,6 +1723,59 @@ int main(int argc, char** argv) { CheckNoError(err); } + // Check that secondary instance works. 
+ StartPhase("open_as_secondary"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err); + rocksdb_t* db1; + rocksdb_options_t* opts = rocksdb_options_create(); + rocksdb_options_set_max_open_files(opts, -1); + rocksdb_options_set_create_if_missing(opts, 1); + snprintf(secondary_path, sizeof(secondary_path), + "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid())); + db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err); + CheckNoError(err); + + rocksdb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_disable_WAL(woptions, 1); + rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err); + CheckNoError(err); + rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_opts, 1); + rocksdb_flush(db, flush_opts, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(ropts, 1); + rocksdb_readoptions_set_snapshot(ropts, NULL); + CheckGet(db, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key0", "value0"); + + rocksdb_writeoptions_disable_WAL(woptions, 0); + rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + CheckGet(db1, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key1", "value1"); + + rocksdb_close(db1); + rocksdb_destroy_db(opts, secondary_path, &err); + CheckNoError(err); + + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(opts); + rocksdb_readoptions_destroy(ropts); + rocksdb_flushoptions_destroy(flush_opts); + } + // Simple sanity check that options setting db_paths work. 
StartPhase("open_db_paths"); { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 5e75dd70964..e8cb3224248 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -138,6 +138,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); @@ -218,6 +222,13 @@ rocksdb_open_for_read_only_column_families( rocksdb_column_family_handle_t** column_family_handles, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** colummn_family_handles, char** errptr); + extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, size_t* lencf, char** errptr); @@ -1375,6 +1386,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( const char* const* file_list, const size_t list_len, const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + /* SliceTransform */ extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* From 15fd3be07bd7a6fa29604277e9a9be21f458c426 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 27 Jun 2019 10:16:21 -0700 Subject: [PATCH 184/572] LRU Cache to enable mid-point insertion by default (#5508) Summary: Mid-point insertion is a useful feature and is mature now. Make it default. Also changed cache_index_and_filter_blocks_with_high_priority=true as default accordingly, so that we won't evict index and filter blocks easier after the change, to avoid too many surprises to users. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5508 Test Plan: Run all existing tests. Differential Revision: D16021179 fbshipit-source-id: ce8456e8d43b3bfb48df6c304b5290a9d19817eb --- HISTORY.md | 4 ++ cache/cache_test.cc | 2 +- include/rocksdb/cache.h | 4 +- include/rocksdb/table.h | 2 +- options/options_test.cc | 39 ++++++++++--------- .../block_based/block_based_table_factory.cc | 7 +++- table/block_based/block_based_table_reader.cc | 27 +++++++------ 7 files changed, 48 insertions(+), 37 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d3660ee64ac..79feac37cbb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # Rocksdb Change Log ## Unreleased +### Default Option Change +* LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explictly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. +* Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. + ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. 
Users can retry after all snapshots are released. * Partitions of partitioned indexes no longer affect the read amplification statistics. diff --git a/cache/cache_test.cc b/cache/cache_test.cc index d7b191bb31f..46ce78db68f 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -90,7 +90,7 @@ class CacheTest : public testing::TestWithParam { bool strict_capacity_limit) { auto type = GetParam(); if (type == kLRU) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, 0.0); } if (type == kClock) { return NewClockCache(capacity, num_shard_bits, strict_capacity_limit); diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 8fb691559d0..410c2cf827a 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -59,7 +59,7 @@ struct LRUCacheOptions { // // See also // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. - double high_pri_pool_ratio = 0.0; + double high_pri_pool_ratio = 0.5; // If non-nullptr will use this allocator instead of system allocator when // allocating memory for cache blocks. Call this method before you start using @@ -99,7 +99,7 @@ struct LRUCacheOptions { // will be at least 512KB and number of shard bits will not exceed 6. extern std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, - bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex); diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 929239100a4..712c604ad35 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -74,7 +74,7 @@ struct BlockBasedTableOptions { // blocks with high priority. If set to true, depending on implementation of // block cache, index and filter blocks may be less likely to be evicted // than data blocks. - bool cache_index_and_filter_blocks_with_high_priority = false; + bool cache_index_and_filter_blocks_with_high_priority = true; // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is diff --git a/options/options_test.cc b/options/options_test.cc index 24aeec99e17..823a9c1e054 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -617,8 +617,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { new_opt.block_cache)->GetNumShardBits(), GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -627,16 +628,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { GetDefaultCacheShardBits( new_opt.block_cache_compressed->GetCapacity())); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); // Set couple of block cache options. 
- ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={num_shard_bits=5;" - "high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.0;}", + &new_opt)); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 5); @@ -648,9 +650,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 5); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.5); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.0); // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, @@ -664,16 +666,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); } #endif // !ROCKSDB_LITE diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 96812e233b8..9dca2a6f0c1 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -167,7 +167,12 @@ BlockBasedTableFactory::BlockBasedTableFactory( if (table_options_.no_block_cache) { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { - table_options_.block_cache = NewLRUCache(8 << 20); + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); } if (table_options_.block_size_deviation < 0 || table_options_.block_size_deviation > 100) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index e73b0c08c41..017d6126c2b 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2626,12 +2626,11 @@ void BlockBasedTableIterator::SeekImpl( CheckOutOfBound(); if (target) { - assert( - !Valid() || - ((block_type_ == BlockType::kIndex && - !table_->get_rep()->index_key_includes_seq) - ? 
(user_comparator_.Compare(ExtractUserKey(*target), key()) <= 0) - : (icomp_.Compare(*target, key()) <= 0))); + assert(!Valid() || ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? (user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); } } @@ -2954,8 +2953,8 @@ InternalIterator* BlockBasedTable::NewIterator( /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, - caller, compaction_readahead_size); + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); @@ -2966,8 +2965,8 @@ InternalIterator* BlockBasedTable::NewIterator( &lookup_context), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, - caller, compaction_readahead_size); + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); } } @@ -3125,8 +3124,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, DataBlockIter biter; uint64_t referenced_data_size = 0; NewDataBlockIterator( - read_options, v.handle, &biter, BlockType::kData, - get_context, &lookup_data_block_context, + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, /*s=*/Status(), /*prefetch_buffer*/ nullptr); if (no_io && biter.status().IsIncomplete()) { @@ -3278,8 +3277,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, offset = iiter->value().handle.offset(); biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, v.handle, &biter, BlockType::kData, - get_context, &lookup_data_block_context, Status(), nullptr); + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, Status(), nullptr); reusing_block = false; } From 5c2f13fb14540f8b57337120811bf755e132c6fb Mon Sep 17 00:00:00 2001 From: Aaron Gao Date: Thu, 27 Jun 2019 11:08:45 -0700 Subject: [PATCH 185/572] add create_column_family and drop_column_family cmd to ldb tool (#5503) Summary: `create_column_family` cmd already exists but was somehow missed in the help message. also add `drop_column_family` cmd which can drop a cf without opening db. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5503 Test Plan: Updated existing ldb_test.py to test deleting a column family. 
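For example, usage from the command line looks like this (the db path and column family name are placeholders; `DoCommand()` prints `OK` on success):

```
$ ./ldb --db=/path/to/db create_column_family three
OK
$ ./ldb --db=/path/to/db drop_column_family three
OK
```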
Differential Revision: D16018414 Pulled By: lightmark fbshipit-source-id: 1fc33680b742104fea86b10efc8499f79e722301 --- tools/ldb_cmd.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/ldb_cmd_impl.h | 17 +++++++++++++++++ tools/ldb_test.py | 2 ++ tools/ldb_tool.cc | 2 ++ 4 files changed, 63 insertions(+) diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 49489173c33..a1507b188b2 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -223,6 +223,10 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { return new CreateColumnFamilyCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == DropColumnFamilyCommand::Name()) { + return new DropColumnFamilyCommand(parsed_params.cmd_params, + parsed_params.option_map, + parsed_params.flags); } else if (parsed_params.cmd == DBFileDumperCommand::Name()) { return new DBFileDumperCommand(parsed_params.cmd_params, parsed_params.option_map, @@ -1125,6 +1129,44 @@ void CreateColumnFamilyCommand::DoCommand() { CloseDB(); } +void DropColumnFamilyCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(DropColumnFamilyCommand::Name()); + ret.append(" --db= "); + ret.append("\n"); +} + +DropColumnFamilyCommand::DropColumnFamilyCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) + : LDBCommand(options, flags, true, {ARG_DB}) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "The name of column family to drop must be specified"); + } else { + cf_name_to_drop_ = params[0]; + } +} + +void DropColumnFamilyCommand::DoCommand() { + auto iter = cf_handles_.find(cf_name_to_drop_); + if (iter == cf_handles_.end()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Column family: " + cf_name_to_drop_ + " doesn't exist in db."); + return; + } + ColumnFamilyHandle* cf_handle_to_drop = iter->second; + Status st = db_->DropColumnFamily(cf_handle_to_drop); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + "Fail to drop column family: " + st.ToString()); + } + CloseDB(); +} + // ---------------------------------------------------------------------------- namespace { diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index 868c81f44c8..23bafe68254 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -205,6 +205,23 @@ class CreateColumnFamilyCommand : public LDBCommand { std::string new_cf_name_; }; +class DropColumnFamilyCommand : public LDBCommand { + public: + static std::string Name() { return "drop_column_family"; } + + DropColumnFamilyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + virtual void DoCommand() override; + + virtual bool NoDBOpen() override { return false; } + + private: + std::string cf_name_to_drop_; +}; + class ReduceDBLevelsCommand : public LDBCommand { public: static std::string Name() { return "reduce_levels"; } diff --git a/tools/ldb_test.py b/tools/ldb_test.py index e64e76ee731..26167ee83fd 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -553,8 +553,10 @@ def testColumnFamilies(self): "1") self.assertRunOK("get cf3_1 --column_family=three", "3") + self.assertRunOK("drop_column_family three", "OK") # non-existing column family. self.assertRunFAIL("get cf3_1 --column_family=four") + self.assertRunFAIL("drop_column_family four") def testIngestExternalSst(self): print "Running testIngestExternalSst..." 
diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc
index fe307eab7dc..2813f6c6edf 100644
--- a/tools/ldb_tool.cc
+++ b/tools/ldb_tool.cc
@@ -82,6 +82,8 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
   DBLoaderCommand::Help(ret);
   ManifestDumpCommand::Help(ret);
   ListColumnFamiliesCommand::Help(ret);
+  CreateColumnFamilyCommand::Help(ret);
+  DropColumnFamilyCommand::Help(ret);
   DBFileDumperCommand::Help(ret);
   InternalDumpCommand::Help(ret);
   RepairCommand::Help(ret);

From 10bae8ceb39db5cb332cbf24f6eec60d8b7d7f20 Mon Sep 17 00:00:00 2001
From: sdong
Date: Fri, 28 Jun 2019 17:38:34 -0700
Subject: [PATCH 186/572] Add more release versions to
 tools/check_format_compatible.sh (#5518)

Summary:
tools/check_format_compatible.sh has lagged behind the recent releases. Catch
it up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5518

Test Plan: Run the command

Differential Revision: D16063180

fbshipit-source-id: d063eb42df9653dec06a2cf0fb982b8a60ca3d2f
---
 tools/check_format_compatible.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 098f0d555a9..444c1111a73 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -55,9 +55,9 @@ EOF
 declare -a backward_compatible_checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb")
 declare -a forward_compatible_checkout_objs=("3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb")
-declare -a forward_compatible_with_options_checkout_objs=("5.11.fb" "5.12.fb" "5.13.fb" "5.14.fb")
+declare -a forward_compatible_with_options_checkout_objs=("5.11.fb" "5.12.fb" "5.13.fb" "5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb")
 declare -a checkout_objs=(${backward_compatible_checkout_objs[@]} ${forward_compatible_checkout_objs[@]} ${forward_compatible_with_options_checkout_objs[@]})
-declare -a extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb")
+declare -a extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb")

 generate_db() {

From 68b46a2e3699180609b65c2529b86b067bd1829d Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Sun, 30 Jun 2019 19:54:28 -0700
Subject: [PATCH 187/572] Block cache tracer: StartTrace returns Busy if trace
 is already started. (#5519)

Summary:
This PR is needed for integration into MyRocks. A second call on StartTrace
returns Busy so that MyRocks may return an error to the user.
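For illustration, a sketch of the caller-side handling this enables, modeled on the `ConsecutiveStartTrace` test added below (the trace path is a placeholder):

```
#include <memory>
#include <string>
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

// Attempt to start tracing; with this patch a second attempt surfaces
// Status::Busy() instead of silently returning OK.
rocksdb::Status StartBlockCacheTrace(rocksdb::BlockCacheTracer* tracer,
                                     rocksdb::Env* env,
                                     const std::string& trace_path) {
  rocksdb::TraceOptions trace_opts;
  std::unique_ptr<rocksdb::TraceWriter> trace_writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(),
                                                  trace_path, &trace_writer);
  if (!s.ok()) {
    return s;
  }
  s = tracer->StartTrace(env, trace_opts, std::move(trace_writer));
  if (s.IsBusy()) {
    // A trace is already in progress; report the error to the caller
    // (e.g. MyRocks) rather than restarting.
  }
  return s;
}
```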
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5519 Test Plan: make clean && USE_CLANG=1 make check -j32 Differential Revision: D16055476 Pulled By: HaoyuHuang fbshipit-source-id: a51772fb0965c873922757eb470a332b1e02a91d --- trace_replay/block_cache_tracer.cc | 2 +- trace_replay/block_cache_tracer_test.cc | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 4c5ad011609..b163216d874 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -233,7 +233,7 @@ Status BlockCacheTracer::StartTrace( std::unique_ptr&& trace_writer) { InstrumentedMutexLock lock_guard(&trace_writer_mutex_); if (writer_.load()) { - return Status::OK(); + return Status::Busy(); } trace_options_ = trace_options; writer_.store( diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index 44cba7bfbd8..e7a5881044f 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -195,6 +195,17 @@ TEST_F(BlockCacheTracerTest, AtomicWrite) { } } +TEST_F(BlockCacheTracerTest, ConsecutiveStartTrace) { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); + BlockCacheTracer writer; + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_NOK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_OK(env_->FileExists(trace_file_path_)); +} + TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { BlockCacheTraceRecord record = GenerateAccessRecord(); { From 7259e28d915af72dd0cd6d055ab966644d83dd68 Mon Sep 17 00:00:00 2001 From: anand76 Date: Sun, 30 Jun 2019 20:52:34 -0700 Subject: [PATCH 188/572] MultiGet parallel IO (#5464) Summary: Enhancement to MultiGet batching to read data blocks required for keys in a batch in parallel from disk. It uses Env::MultiRead() API to read multiple blocks and reduce latency. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5464 Test Plan: 1. make check 2. make asan_check 3. make asan_crash Differential Revision: D15911771 Pulled By: anand1976 fbshipit-source-id: 605036b9af0f90ca0020dc87c3a86b4da6e83394 --- HISTORY.md | 1 + db/db_basic_test.cc | 289 ++++++++++ table/block_based/block_based_table_reader.cc | 519 +++++++++++++++--- table/block_based/block_based_table_reader.h | 23 +- table/format.h | 2 + util/file_reader_writer.cc | 43 ++ util/file_reader_writer.h | 2 + 7 files changed, 812 insertions(+), 67 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 79feac37cbb..2c8dc8c3ab9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -25,6 +25,7 @@ * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases * Log Writer will flush after finishing the whole record, rather than a fragment. +* Lower MultiGet batching API latency by reading data blocks from disk in parallel ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. 
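For illustration, the batched MultiGet() overload whose latency this change lowers, sketched with placeholder keys (the same overload is exercised by the new db_basic_test below):

```
#include <string>
#include <vector>
#include "rocksdb/db.h"

void BatchedLookup(rocksdb::DB* db) {
  // Keys are kept sorted, matching sorted_input=true below.
  std::vector<std::string> key_strs{"k1", "k2", "k3"};
  std::vector<rocksdb::Slice> keys(key_strs.begin(), key_strs.end());
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> statuses(keys.size());
  // With this patch, data blocks that miss the block cache are fetched in a
  // single Env::MultiRead() call instead of one serial read per block.
  db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(), keys.size(),
               keys.data(), values.data(), statuses.data(),
               /*sorted_input=*/true);
  for (size_t i = 0; i < keys.size(); ++i) {
    if (statuses[i].ok()) {
      // values[i] references the result without an extra copy.
    }
  }
}
```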
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 1aec864dd6f..66d3b3aff7c 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "table/block_based/block_builder.h" #include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" @@ -1285,6 +1286,294 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } +class DBBasicTestWithParallelIO + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBBasicTestWithParallelIO() + : DBTestBase("/db_basic_test_with_parallel_io") { + bool compressed_cache = std::get<0>(GetParam()); + bool uncompressed_cache = std::get<1>(GetParam()); + compression_enabled_ = std::get<2>(GetParam()); + fill_cache_ = std::get<3>(GetParam()); + + if (compressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + compressed_cache_ = std::make_shared(cache); + } + if (uncompressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + uncompressed_cache_ = std::make_shared(cache); + } + + env_->count_random_reads_ = true; + + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.block_cache = uncompressed_cache_; + table_options.block_cache_compressed = compressed_cache_; + table_options.flush_block_policy_factory.reset( + new MyFlushBlockPolicyFactory()); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + if (!compression_enabled_) { + options.compression = kNoCompression; + } + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. 
A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), values_[i]) == Status::OK()); + } + Flush(); + } + + bool CheckValue(int i, const std::string& value) { + if (values_[i].compare(value) == 0) { + return true; + } + return false; + } + + int num_lookups() { return uncompressed_cache_->num_lookups(); } + int num_found() { return uncompressed_cache_->num_found(); } + int num_inserts() { return uncompressed_cache_->num_inserts(); } + + int num_lookups_compressed() { + return compressed_cache_->num_lookups(); + } + int num_found_compressed() { + return compressed_cache_->num_found(); + } + int num_inserts_compressed() { + return compressed_cache_->num_inserts(); + } + + bool fill_cache() { return fill_cache_; } + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + private: + class MyFlushBlockPolicyFactory + : public FlushBlockPolicyFactory { + public: + MyFlushBlockPolicyFactory() {} + + virtual const char* Name() const override { + return "MyFlushBlockPolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& data_block_builder) const override { + return new MyFlushBlockPolicy(data_block_builder); + } + }; + + class MyFlushBlockPolicy + : public FlushBlockPolicy { + public: + explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder) + : num_keys_(0), data_block_builder_(data_block_builder) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (data_block_builder_.empty()) { + // First key in this block + num_keys_ = 1; + return false; + } + // Flush every 10 keys + if (num_keys_ == 10) { + num_keys_ = 1; + return true; + } + num_keys_++; + return false; + } + + private: + int num_keys_; + const BlockBuilder& data_block_builder_; + }; + + class MyBlockCache + : public Cache { + public: + explicit MyBlockCache(std::shared_ptr& target) + : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} + + virtual const char* Name() const override { return "MyBlockCache"; } + + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + num_inserts_++; + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + virtual Handle* Lookup(const Slice& key, + Statistics* stats = nullptr) override { + num_lookups_++; + Handle* handle = target_->Lookup(key, stats); + if (handle != nullptr) { + num_found_++; + } + return handle; + } + + virtual bool Ref(Handle* handle) override { + return target_->Ref(handle); + } + + virtual bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + virtual void* Value(Handle* handle) override { + return target_->Value(handle); + } + + virtual void Erase(const Slice& key) override { + target_->Erase(key); + } + virtual uint64_t NewId() override { + return target_->NewId(); + } + + virtual void SetCapacity(size_t capacity) override { + target_->SetCapacity(capacity); + } + + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + virtual bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + virtual size_t GetCapacity() const override { + return 
target_->GetCapacity(); + } + + virtual size_t GetUsage() const override { + return target_->GetUsage(); + } + + virtual size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + virtual size_t GetPinnedUsage() const override { + return target_->GetPinnedUsage(); + } + + virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } + + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + return target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + virtual void EraseUnRefEntries() override { + return target_->EraseUnRefEntries(); + } + + int num_lookups() { return num_lookups_; } + + int num_found() { return num_found_; } + + int num_inserts() { return num_inserts_; } + private: + std::shared_ptr target_; + int num_lookups_; + int num_found_; + int num_inserts_; + }; + + std::shared_ptr compressed_cache_; + std::shared_ptr uncompressed_cache_; + bool compression_enabled_; + std::vector values_; + bool fill_cache_; +}; + +TEST_P(DBBasicTestWithParallelIO, MultiGet) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + int expected_reads = random_reads + (fill_cache() ? 0 : 2); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(10); + statuses.resize(10); + std::vector key_ints{1,2,15,16,55,81,82,83,84,85}; + for (size_t i = 0; i < key_ints.size(); ++i) { + key_data[i] = Key(key_ints[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_ints.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString())); + } + expected_reads += (fill_cache() ? 
2 : 4); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); +} + +INSTANTIATE_TEST_CASE_P( + ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + ::testing::Values(std::make_tuple(false, true, true, true), + std::make_tuple(true, true, true, true), + std::make_tuple(false, true, false, true), + std::make_tuple(false, true, true, false), + std::make_tuple(true, true, true, false), + std::make_tuple(false, true, false, false))); + class DBBasicTestWithTimestampWithParam : public DBTestBase, public testing::WithParamInterface { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 017d6126c2b..edddecf78bd 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -160,6 +160,13 @@ bool PrefixExtractorChanged(const TableProperties* table_properties, } } +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} + } // namespace // Encapsulates common functionality for the various index reader @@ -421,7 +428,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context); + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, + /*contents=*/nullptr); assert(s.ok() || block.GetValue() == nullptr); if (s.ok() && block.GetValue() != nullptr) { @@ -1745,8 +1753,6 @@ Status BlockBasedTable::PutDataBlockToCache( : Cache::Priority::LOW; assert(cached_block); assert(cached_block->IsEmpty()); - assert(raw_block_comp_type == kNoCompression || - block_cache_compressed != nullptr); Status s; Statistics* statistics = ioptions.statistics; @@ -2195,11 +2201,105 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( rep->index_value_is_full, block_contents_pinned); } +// Convert an uncompressed data block (i.e CachableEntry) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, CachableEntry& block, TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} + +// Lookup the cache for the given data block referenced by an index iterator +// value (i.e BlockHandle). If it exists in the cache, initialize block to +// the contents of the data block. +Status BlockBasedTable::GetDataBlockFromCache( + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block, BlockType block_type, + GetContext* get_context) const { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + Status s = RetrieveBlock(nullptr, ro, handle, uncompression_dict, block, + block_type, get_context, &lookup_data_block_context); + if (s.IsIncomplete()) { + s = Status::OK(); + } + + return s; +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. 
Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); @@ -2231,14 +2331,17 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( compressed_cache_key); } - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - ro, block_entry, uncompression_dict, block_type, - get_context); - if (block_entry->GetValue()) { - // TODO(haoyu): Differentiate cache hit on uncompressed block cache and - // compressed block cache. - is_cache_hit = true; + if (!contents) { + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, block_type, + get_context); + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } } + // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { @@ -2248,7 +2351,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; - { + if (!contents) { StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, @@ -2259,6 +2362,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); + contents = &raw_block_contents; + } else { + raw_block_comp_type = contents->get_compression_type(); } if (s.ok()) { @@ -2266,7 +2372,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // If filling cache is allowed and a cache is configured, try to put the // block to the cache. s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - block_entry, &raw_block_contents, + block_entry, contents, raw_block_comp_type, uncompression_dict, seq_no, GetMemoryAllocator(rep_->table_options), block_type, get_context); @@ -2331,6 +2437,172 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( return s; } +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is a nullptr +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If its +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to heap. In any case, the CachableEntry returned will +// own the data bytes. 
+// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles. Some of them me be NULL handles +// scratch - An optional contiguous buffer to read compressed blocks into +void BlockBasedTable::MaybeLoadBlocksToCache( + const ReadOptions& options, + const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector< + CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, + const UncompressionDict& uncompression_dict) const { + + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableCFOptions& ioptions = rep_->ioptions; + SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (file->use_direct_io() || ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = RetrieveBlock(nullptr, options, handle, + uncompression_dict, &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context); + } + return; + } + + autovector read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + ReadRequest req; + req.len = handle.size() + kBlockTrailerSize; + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } + req.offset = handle.offset(); + req.status = Status::OK(); + read_reqs.emplace_back(req); + } + + file->MultiRead(&read_reqs[0], read_reqs.size()); + + size_t read_req_idx = 0; + idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + ReadRequest& req = read_reqs[read_req_idx++]; + Status s = req.status; + if (s.ok()) { + if (req.result.size() != handle.size() + kBlockTrailerSize) { + s = Status::Corruption("truncated block read from " + + rep_->file->file_name() + " offset " + + ToString(handle.offset()) + ", expected " + + ToString(handle.size() + kBlockTrailerSize) + + " bytes, got " + ToString(req.result.size())); + } + } + + BlockContents raw_block_contents; + if (s.ok()) { + if (scratch == nullptr) { + // We allocated a buffer for this block. 
Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + std::unique_ptr raw_block(req.scratch); + raw_block_contents = BlockContents(std::move(raw_block), + handle.size()); + } else { + // We used the scratch buffer, so no need to free anything + raw_block_contents = BlockContents(Slice(req.scratch, + handle.size())); + } +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + uint32_t expected = DecodeFixed32(data + handle.size() + 1); + s = rocksdb::VerifyChecksum(footer.checksum(), req.result.data(), + handle.size() + 1, expected); + } + } + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the raw block contents, it will + // avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache(nullptr, options, handle, + uncompression_dict, block_entry, BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + &raw_block_contents); + } else { + CompressionType compression_type = + raw_block_contents.get_compression_type(); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, req.result.data(), handle.size(), + &contents, footer.version(), rep_->ioptions, + memory_allocator); + } else { + if (scratch != nullptr) { + // If we used the scratch buffer, then the contents need to be + // copied to heap + Slice raw = Slice(req.result.data(), handle.size()); + contents = BlockContents(CopyBufferToHeap( + GetMemoryAllocator(rep_->table_options), raw), + handle.size()); + } else { + contents = std::move(raw_block_contents); + } + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue(new Block(std::move(contents), + global_seqno, read_amp_bytes_per_bit, ioptions.statistics)); + } + } + } + (*statuses)[idx_in_batch] = s; + } +} + Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, @@ -2347,7 +2619,8 @@ Status BlockBasedTable::RetrieveBlock( block_type != BlockType::kIndex)) { s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, uncompression_dict, block_entry, - block_type, get_context, lookup_context); + block_type, get_context, lookup_context, + /*contents=*/nullptr); if (!s.ok()) { return s; @@ -3248,8 +3521,101 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, iiter_unique_ptr.reset(iiter); } - DataBlockIter biter; uint64_t offset = std::numeric_limits::max(); + autovector block_handles; + autovector, MultiGetContext::MAX_BATCH_SIZE> results; + autovector statuses; + static const size_t kMultiGetReadStackBufSize = 8192; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + BlockCacheLookupContext lookup_compression_dict_context( + TableReaderCaller::kUserMultiGet); + auto uncompression_dict_storage = GetUncompressionDict(nullptr, no_io, + sst_file_range.begin()->get_context, + &lookup_compression_dict_context); + 
const UncompressionDict& uncompression_dict = + uncompression_dict_storage.GetValue() == nullptr + ? UncompressionDict::GetEmptyDict() + : *uncompression_dict_storage.GetValue(); + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + *(miter->s) = iiter->status(); + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == offset) { + // We're going to reuse the block for this key later on. No need to + // look it up now. Place a null handle + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + offset = v.handle.offset(); + BlockHandle handle = v.handle; + Status s = GetDataBlockFromCache(ro, handle, uncompression_dict, + &(results.back()), BlockType::kData, miter->get_context); + if (s.ok() && !results.back().IsEmpty()) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + block_handles.emplace_back(handle); + total_len += handle.size(); + } + } + + if (total_len) { + char* scratch = nullptr; + // If the blocks need to be uncompressed and we don't need the + // compressed blocks, then we can use a contiguous block of + // memory to read in all the blocks as it will be temporary + // storage + // 1. If blocks are compressed and compressed block cache is there, + // alloc heap bufs + // 2. If blocks are uncompressed, alloc heap bufs + // 3. If blocks are compressed and no compressed block cache, use + // stack buf + if (rep_->table_options.block_cache_compressed == nullptr && + rep_->blocks_maybe_compressed) { + if (total_len <= kMultiGetReadStackBufSize) { + scratch = stack_buf; + } else { + scratch = new char[total_len]; + block_buf.reset(scratch); + } + } + MaybeLoadBlocksToCache(read_options, + &data_block_range, &block_handles, &statuses, &results, + scratch, uncompression_dict); + } + } + + DataBlockIter first_biter; + DataBlockIter next_biter; + size_t idx_in_batch = 0; for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); ++miter) { Status s; @@ -3257,83 +3623,97 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const Slice& key = miter->ikey; bool matched = false; // if such user key matched a key in SST bool done = false; - for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - IndexValue v = iiter->value(); - if (!v.first_internal_key.empty() && !skip_filters && - UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { - // The requested key falls between highest key in previous block and - // lowest key in current block. 
- break; - } - + bool first_block = true; + do { + DataBlockIter* biter = nullptr; bool reusing_block = true; uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); - if (iiter->value().handle.offset() != offset) { - offset = iiter->value().handle.offset(); - biter.Invalidate(Status::OK()); + if (first_block) { + if (!block_handles[idx_in_batch].IsNull() || + !results[idx_in_batch].IsEmpty()) { + first_biter.Invalidate(Status::OK()); + NewDataBlockIterator( + read_options, results[idx_in_batch], &first_biter, + statuses[idx_in_batch]); + reusing_block = false; + } + biter = &first_biter; + idx_in_batch++; + } else { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + + next_biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, v.handle, &biter, BlockType::kData, get_context, - &lookup_data_block_context, Status(), nullptr); + read_options, iiter->value().handle, &next_biter, + BlockType::kData, get_context, &lookup_data_block_context, + Status(), nullptr); + biter = &next_biter; reusing_block = false; } if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { + biter->status().IsIncomplete()) { // couldn't get block from block_cache // Update Saver.state to Found because we are only looking for // whether we can guarantee the key is not there when "no_io" is set get_context->MarkKeyMayExist(); break; } - if (!biter.status().ok()) { - s = biter.status(); + if (!biter->status().ok()) { + s = biter->status(); break; } - bool may_exist = biter.SeekForGet(key); + bool may_exist = biter->SeekForGet(key); if (!may_exist) { // HashSeek cannot find the key this block and the the iter is not // the end of the block, i.e. cannot be in the following blocks // either. In this case, the seek_key cannot be found, so we break // from the top level for-loop. 
- done = true; - } else { - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - Cleanable dummy; - Cleanable* value_pinner = nullptr; - - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - if (biter.IsValuePinned()) { - if (reusing_block) { - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(biter.cache_handle() != nullptr); - block_cache->Ref(biter.cache_handle()); - dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, - biter.cache_handle()); - value_pinner = &dummy; - } else { - value_pinner = &biter; - } - } + break; + } - if (!get_context->SaveValue(parsed_key, biter.value(), &matched, - value_pinner)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); - done = true; - break; + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + + if (!ParseInternalKey(biter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter->IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter->cache_handle() != nullptr); + block_cache->Ref(biter->cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter->cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = biter; } } - s = biter.status(); + + if (!get_context->SaveValue( + parsed_key, biter->value(), &matched, value_pinner)) { + does_referenced_key_exist = true; + referenced_data_size = biter->key().size() + biter->value().size(); + done = true; + break; + } + s = biter->status(); } // Write the block cache access. 
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { @@ -3354,11 +3734,18 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, access_record, lookup_data_block_context.block_key, rep_->cf_name_for_tracing(), key); } + s = biter->status(); if (done) { // Avoid the extra Next which is expensive in two-level indexes break; } - } + if (first_block) { + iiter->Seek(key); + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + if (matched && filter != nullptr && !filter->IsBlockBased()) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 4356713910c..358bc8b8d22 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -233,6 +233,12 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* lookup_context, Status s, FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; + // input_iter: if it is not null, update this one and return it as Iterator + template + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, Status s) const; + class PartitionedIndexIteratorState; friend class PartitionIndexReader; @@ -276,7 +282,8 @@ class BlockBasedTable : public TableReader { FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context) const; + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -289,6 +296,20 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* lookup_context, bool for_compaction = false) const; + Status GetDataBlockFromCache( + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context) const; + + void MaybeLoadBlocksToCache( + const ReadOptions& options, const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector< + CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const; + // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file // were they not present in cache yet. 
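The parallel read path above rests on the new `RandomAccessFileReader::MultiRead()`, used by `MaybeLoadBlocksToCache()` and declared in the file_reader_writer changes below. A sketch of its calling convention, with placeholder offsets and buffer sizes:

```
#include "util/file_reader_writer.h"

// Issue two reads in one call; each ReadRequest carries its own status and
// result slice, so failures can be handled per block.
rocksdb::Status ReadTwoBlocks(rocksdb::RandomAccessFileReader* reader) {
  char buf0[4096];
  char buf1[4096];
  rocksdb::ReadRequest reqs[2];
  reqs[0].offset = 0;
  reqs[0].len = sizeof(buf0);
  reqs[0].scratch = buf0;
  reqs[1].offset = 8192;
  reqs[1].len = sizeof(buf1);
  reqs[1].scratch = buf1;
  rocksdb::Status s = reader->MultiRead(reqs, 2);
  for (auto& req : reqs) {
    if (!req.status.ok()) {
      return req.status;
    }
    // req.result points at the bytes actually read into req.scratch.
  }
  return s;
}
```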
diff --git a/table/format.h b/table/format.h index 539ca88805c..effc13addaf 100644 --- a/table/format.h +++ b/table/format.h @@ -26,7 +26,9 @@ #include "options/cf_options.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" +#include "util/crc32c.h" #include "util/file_reader_writer.h" +#include "util/xxhash.h" namespace rocksdb { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index bf88503339a..f49866d13e7 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -192,6 +192,49 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, return s; } +Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs, + size_t num_reqs) const { + Status s; + uint64_t elapsed = 0; + assert(!use_direct_io()); + assert(!for_compaction_); + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif // ROCKSDB_LITE + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->MultiRead(read_reqs, num_reqs); + } + for (size_t i = 0; i < num_reqs; ++i) { +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(read_reqs[i].offset, + read_reqs[i].result.size(), start_ts, finish_ts, + read_reqs[i].status); + } +#endif // ROCKSDB_LITE + IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size()); + } + SetPerfLevel(prev_perf_level); + } + if (stats_ != nullptr && file_read_hist_ != nullptr) { + file_read_hist_->Add(elapsed); + } + + return s; +} + Status WritableFileWriter::Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 01df1067ed9..0a7e5032d2f 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -161,6 +161,8 @@ class RandomAccessFileReader { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, bool for_compaction = false) const; + Status MultiRead(ReadRequest* reqs, size_t num_reqs) const; + Status Prefetch(uint64_t offset, size_t n) const { return file_->Prefetch(offset, n); } From c36067575037573a1ee3980bf8c27a93b4cf0694 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 1 Jul 2019 11:45:12 -0700 Subject: [PATCH 189/572] Add secondary instance to stress test (#5479) Summary: This PR allows users to run stress tests on secondary instance. 
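Both primitives the stress test relies on are already public; a minimal sketch (paths are placeholders):

```
#include <string>
#include "rocksdb/db.h"

// Open a read-only secondary that tails the primary's MANIFEST and WAL,
// then replay whatever the primary has written since the open.
rocksdb::Status RunSecondary(const std::string& primary_path,
                             const std::string& secondary_path) {
  rocksdb::Options options;
  options.max_open_files = -1;  // keep all SST files open in the secondary
  rocksdb::DB* secondary = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(options, primary_path,
                                                   secondary_path, &secondary);
  if (!s.ok()) {
    return s;
  }
  s = secondary->TryCatchUpWithPrimary();
  delete secondary;
  return s;
}
```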
Test plan (on devserver)
```
./db_stress -ops_per_thread=100000 -enable_secondary=true -threads=32 -secondary_catch_up_one_in=10000 -clear_column_family_one_in=1000 -reopen=100
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5479

Differential Revision: D16074325

Pulled By: riversand963

fbshipit-source-id: c0ed959e7b6c7cda3efd0b3070ab379de3b29f1c
---
 tools/db_stress.cc | 171 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)

diff --git a/tools/db_stress.cc b/tools/db_stress.cc
index 6a3e8bdefb1..813f8068278 100644
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@@ -333,6 +333,11 @@ DEFINE_bool(use_block_based_filter, false, "use block based filter"

 DEFINE_string(db, "", "Use the db with the following name.");

+DEFINE_string(secondaries_base, "",
+              "Use this path as the base path for secondary instances.");
+
+DEFINE_bool(enable_secondary, false, "Enable secondary instance.");
+
 DEFINE_string(
     expected_values_path, "",
     "File where the array of expected uint32_t values will be stored. If "
@@ -599,6 +604,13 @@ DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file");

 DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");

+DEFINE_int32(secondary_catch_up_one_in, 0,
+             "If non-zero, the secondaries attempt to catch up with the primary "
+             "once for every N operations on average. 0 indicates the "
+             "secondaries do not try to catch up after open.");
+
+static std::shared_ptr<rocksdb::Statistics> dbstats_secondaries;
+
 enum RepFactory {
   kSkipList,
   kHashSkipList,
@@ -1423,6 +1435,17 @@ class StressTest {
     }
     column_families_.clear();
     delete db_;
+
+    assert(secondaries_.size() == secondary_cfh_lists_.size());
+    size_t n = secondaries_.size();
+    for (size_t i = 0; i != n; ++i) {
+      for (auto* cf : secondary_cfh_lists_[i]) {
+        delete cf;
+      }
+      secondary_cfh_lists_[i].clear();
+      delete secondaries_[i];
+    }
+    secondaries_.clear();
   }

   std::shared_ptr<Cache> NewCache(size_t capacity) {
@@ -1620,6 +1643,60 @@ class StressTest {
       }
     }

+#ifndef ROCKSDB_LITE
+    if (FLAGS_enable_secondary) {
+      now = FLAGS_env->NowMicros();
+      fprintf(stdout, "%s Start to verify secondaries against primary\n",
+              FLAGS_env->TimeToString(static_cast<uint64_t>(now) / 1000000)
+                  .c_str());
+    }
+    for (size_t k = 0; k != secondaries_.size(); ++k) {
+      Status s = secondaries_[k]->TryCatchUpWithPrimary();
+      if (!s.ok()) {
+        fprintf(stderr, "Secondary failed to catch up with primary\n");
+        return false;
+      }
+      ReadOptions ropts;
+      ropts.total_order_seek = true;
+      // Verify only the default column family since the primary may have
+      // dropped other column families after most recent reopen.
+ std::unique_ptr iter1(db_->NewIterator(ropts)); + std::unique_ptr iter2(secondaries_[k]->NewIterator(ropts)); + for (iter1->SeekToFirst(), iter2->SeekToFirst(); + iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { + if (iter1->key().compare(iter2->key()) != 0 || + iter1->value().compare(iter2->value())) { + fprintf(stderr, + "Secondary %d contains different data from " + "primary.\nPrimary: %s : %s\nSecondary: %s : %s\n", + static_cast(k), + iter1->key().ToString(/*hex=*/true).c_str(), + iter1->value().ToString(/*hex=*/true).c_str(), + iter2->key().ToString(/*hex=*/true).c_str(), + iter2->value().ToString(/*hex=*/true).c_str()); + return false; + } + } + if (iter1->Valid() && !iter2->Valid()) { + fprintf(stderr, + "Secondary %d record count is smaller than that of primary\n", + static_cast(k)); + return false; + } else if (!iter1->Valid() && iter2->Valid()) { + fprintf(stderr, + "Secondary %d record count is larger than that of primary\n", + static_cast(k)); + return false; + } + } + if (FLAGS_enable_secondary) { + now = FLAGS_env->NowMicros(); + fprintf(stdout, "%s Verification of secondaries succeeded\n", + FLAGS_env->TimeToString(static_cast(now) / 1000000) + .c_str()); + } +#endif // ROCKSDB_LITE + if (shared.HasVerificationFailedYet()) { printf("Verification failed :(\n"); return false; @@ -2231,6 +2308,19 @@ class StressTest { TestIterate(thread, read_opts, rand_column_families, rand_keys); } thread->stats.FinishedSingleOp(); +#ifndef ROCKSDB_LITE + uint32_t tid = thread->tid; + assert(secondaries_.empty() || + static_cast(tid) < secondaries_.size()); + if (FLAGS_secondary_catch_up_one_in > 0 && + thread->rand.Uniform(FLAGS_secondary_catch_up_one_in) == 0) { + Status s = secondaries_[tid]->TryCatchUpWithPrimary(); + if (!s.ok()) { + VerificationAbort(shared, "Secondary instance failed to catch up", s); + break; + } + } +#endif } thread->stats.Stop(); @@ -2864,11 +2954,52 @@ class StressTest { } assert(!s.ok() || column_families_.size() == static_cast(FLAGS_column_families)); + + if (FLAGS_enable_secondary) { +#ifndef ROCKSDB_LITE + secondaries_.resize(FLAGS_threads); + std::fill(secondaries_.begin(), secondaries_.end(), nullptr); + secondary_cfh_lists_.clear(); + secondary_cfh_lists_.resize(FLAGS_threads); + Options tmp_opts; + tmp_opts.max_open_files = FLAGS_open_files; + tmp_opts.statistics = dbstats_secondaries; + tmp_opts.env = FLAGS_env; + for (size_t i = 0; i != static_cast(FLAGS_threads); ++i) { + const std::string secondary_path = + FLAGS_secondaries_base + "/" + std::to_string(i); + s = DB::OpenAsSecondary(tmp_opts, FLAGS_db, secondary_path, + cf_descriptors, &secondary_cfh_lists_[i], + &secondaries_[i]); + if (!s.ok()) { + break; + } + } +#else + fprintf(stderr, "Secondary is not supported in RocksDBLite\n"); + exit(1); +#endif + } } else { #ifndef ROCKSDB_LITE DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl); db_ = db_with_ttl; + if (FLAGS_enable_secondary) { + secondaries_.resize(FLAGS_threads); + std::fill(secondaries_.begin(), secondaries_.end(), nullptr); + Options tmp_opts; + tmp_opts.max_open_files = FLAGS_open_files; + for (size_t i = 0; i != static_cast(FLAGS_threads); ++i) { + const std::string secondary_path = + FLAGS_secondaries_base + "/" + std::to_string(i); + s = DB::OpenAsSecondary(tmp_opts, FLAGS_db, secondary_path, + &secondaries_[i]); + if (!s.ok()) { + break; + } + } + } #else fprintf(stderr, "TTL is not supported in RocksDBLite\n"); exit(1); @@ -2891,6 +3022,17 @@ class StressTest { txn_db_ = 
nullptr;
 #endif

+    assert(secondaries_.size() == secondary_cfh_lists_.size());
+    size_t n = secondaries_.size();
+    for (size_t i = 0; i != n; ++i) {
+      for (auto* cf : secondary_cfh_lists_[i]) {
+        delete cf;
+      }
+      secondary_cfh_lists_[i].clear();
+      delete secondaries_[i];
+    }
+    secondaries_.clear();
+
     num_times_reopened_++;
     auto now = FLAGS_env->NowMicros();
     fprintf(stdout, "%s Reopening database for the %dth time\n",
@@ -2903,6 +3045,10 @@
     if (dbstats) {
       fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
     }
+    if (dbstats_secondaries) {
+      fprintf(stdout, "Secondary instances STATISTICS:\n%s\n",
+              dbstats_secondaries->ToString().c_str());
+    }
   }

   std::shared_ptr<Cache> cache_;
@@ -2920,6 +3066,10 @@
   std::unordered_map<std::string, std::vector<std::string>> options_table_;
   std::vector<std::string> options_index_;
   std::atomic<bool> db_preload_finished_;
+
+  // Fields used for stress-testing secondary instance in the same process
+  std::vector<DB*> secondaries_;
+  std::vector<std::vector<ColumnFamilyHandle*> > secondary_cfh_lists_;
 };

 class NonBatchedOpsStressTest : public StressTest {
@@ -4153,6 +4303,9 @@ int main(int argc, char** argv) {

   if (FLAGS_statistics) {
     dbstats = rocksdb::CreateDBStatistics();
+    if (FLAGS_enable_secondary) {
+      dbstats_secondaries = rocksdb::CreateDBStatistics();
+    }
   }
   FLAGS_compression_type_e =
     StringToCompressionType(FLAGS_compression_type.c_str());
@@ -4261,6 +4414,24 @@
     FLAGS_db = default_db_path;
   }

+  if (FLAGS_enable_secondary && FLAGS_secondaries_base.empty()) {
+    std::string default_secondaries_path;
+    FLAGS_env->GetTestDirectory(&default_secondaries_path);
+    default_secondaries_path += "/dbstress_secondaries";
+    rocksdb::Status s = FLAGS_env->CreateDirIfMissing(default_secondaries_path);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to create directory %s: %s\n",
+              default_secondaries_path.c_str(), s.ToString().c_str());
+      exit(1);
+    }
+    FLAGS_secondaries_base = default_secondaries_path;
+  }
+
+  if (!FLAGS_enable_secondary && FLAGS_secondary_catch_up_one_in > 0) {
+    fprintf(stderr, "Secondary instance is disabled.\n");
+    exit(1);
+  }
+
   rocksdb_kill_odds = FLAGS_kill_random_test;
   rocksdb_kill_prefix_blacklist = SplitString(FLAGS_kill_prefix_blacklist);

From 3886dddc3b44bf5061c0f93eab578c51e8bad7bd Mon Sep 17 00:00:00 2001
From: Zhongyi Xie
Date: Mon, 1 Jul 2019 11:53:25 -0700
Subject: [PATCH 190/572] force flushing stats CF to avoid holding old logs
 (#5509)

Summary:
WAL records RocksDB writes to all column families. When a user flushes a
column family, the old WAL will not accept new writes but cannot be deleted
yet, because it may still contain live data for other column families. (See
https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log#life-cycle-of-a-wal
for a detailed explanation.)
Because of this, a column family that receives very infrequent writes and is
never flushed manually can prevent a lot of WALs from being deleted. PR
https://github.com/facebook/rocksdb/pull/5046 introduced the persistent stats
column family, which is a good example of such a column family. Depending on
the config, it may have long intervals between writes, and the user is
typically unaware of it, which makes it difficult to call a manual flush for
it.
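A sketch of the failure mode just described, assuming two user column families where only one is written frequently (names and counts are illustrative):

```
#include <cassert>
#include <string>
#include "rocksdb/db.h"

void IllustrateWalPinning(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* hot,
                          rocksdb::ColumnFamilyHandle* cold) {
  // One early write to the cold CF lands in the current WAL, say WAL 1.
  assert(db->Put(rocksdb::WriteOptions(), cold, "k", "v").ok());
  // The hot CF keeps writing and flushing, moving on to newer WALs...
  for (int i = 0; i < 1000; ++i) {
    assert(db->Put(rocksdb::WriteOptions(), hot, "k" + std::to_string(i), "v")
               .ok());
  }
  assert(db->Flush(rocksdb::FlushOptions(), hot).ok());
  // ...but WAL 1 cannot be deleted: it still holds the cold CF's only
  // durable copy of "k" until that CF's memtable is flushed as well.
  assert(db->Flush(rocksdb::FlushOptions(), cold).ok());
}
```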
This PR addresses the problem for persistent stats column family by forcing a flush for persistent stats column family when 1) another column family is flushed 2) persistent stats column family's log number is the smallest among all column families, this way persistent stats column family will keep advancing its log number when necessary, allowing RocksDB to delete old WAL files. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5509 Differential Revision: D16045896 Pulled By: miasantreble fbshipit-source-id: 286837b633e988417f0096ff38384742d3b40ef4 --- db/db_impl/db_impl.h | 2 + db/db_impl/db_impl_compaction_flush.cc | 28 +++++++++- db/db_impl/db_impl_write.cc | 37 +++++++++++++ monitoring/stats_history_test.cc | 77 +++++++++++++++++++++++++- 4 files changed, 142 insertions(+), 2 deletions(-) diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index b5437c49543..e57768a74af 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1292,6 +1292,8 @@ class DBImpl : public DB { Status ScheduleFlushes(WriteContext* context); + void MaybeFlushStatsCF(autovector* cfds); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); void SelectColumnFamiliesForAtomicFlush(autovector* cfds); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 8cb37484cac..ff03e591d28 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1551,13 +1551,39 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { s = SwitchMemtable(cfd, &context); } - if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { flush_memtable_id = cfd->imm()->GetLatestMemTableID(); flush_req.emplace_back(cfd, flush_memtable_id); } + if (immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && cfd_stats != cfd && + !cfd_stats->mem()->IsEmpty()) { + // only force flush stats CF when it will be the only CF lagging + // behind after the current flush + bool stats_cf_flush_needed = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats || loop_cfd == cfd) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + stats_cf_flush_needed = false; + } + } + if (stats_cf_flush_needed) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with manual flush of %s " + "to avoid holding old logs", cfd->GetName().c_str()); + s = SwitchMemtable(cfd_stats, &context); + flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd_stats, flush_memtable_id); + } + } + } } if (s.ok() && !flush_req.empty()) { diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 21b123c3a94..c0d320013b7 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1228,6 +1228,7 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { cfds.push_back(cfd); } } + MaybeFlushStatsCF(&cfds); } for (const auto cfd : cfds) { cfd->Ref(); @@ -1294,6 +1295,7 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { if (cfd_picked != nullptr) { cfds.push_back(cfd_picked); } + MaybeFlushStatsCF(&cfds); } for (const auto cfd : cfds) { @@ -1437,6 +1439,40 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const 
WriteOptions& write_options, return Status::OK(); } +void DBImpl::MaybeFlushStatsCF(autovector* cfds) { + assert(cfds != nullptr); + if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) { + for (ColumnFamilyData* cfd : *cfds) { + if (cfd == cfd_stats) { + // stats CF already included in cfds + return; + } + } + // force flush stats CF when its log number is less than all other CF's + // log numbers + bool force_flush_stats_cf = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + force_flush_stats_cf = false; + } + } + if (force_flush_stats_cf) { + cfds->push_back(cfd_stats); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with automated flush " + "to avoid holding old logs"); + } + } + } +} + Status DBImpl::ScheduleFlushes(WriteContext* context) { autovector cfds; if (immutable_db_options_.atomic_flush) { @@ -1450,6 +1486,7 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { cfds.push_back(tmp_cfd); } + MaybeFlushStatsCF(&cfds); } Status status; for (auto& cfd : cfds) { diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index 16681fe05d8..bef928558d7 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -561,7 +561,7 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { Close(); // Reopen and flush memtable. - Reopen(options); + ASSERT_OK(TryReopen(options)); Flush(); Close(); // Now check keys in read only mode. 
@@ -569,6 +569,81 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { } #endif // !ROCKSDB_LITE +TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ColumnFamilyData* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_stats = static_cast( + dbfull()->PersistentStatsColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_test = + static_cast(handles_[1])->cfd(); + + ASSERT_OK(Put("foo", "v0")); + ASSERT_OK(Put("bar", "v0")); + ASSERT_EQ("v0", Get("bar")); + ASSERT_EQ("v0", Get("foo")); + ASSERT_OK(Put(1, "Eevee", "v0")); + ASSERT_EQ("v0", Get(1, "Eevee")); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + // writing to all three cf, flush default cf + // LogNumbers: default: 14, stats: 4, pikachu: 4 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo1", "v1")); + ASSERT_OK(Put("bar1", "v1")); + ASSERT_EQ("v1", Get("bar1")); + ASSERT_EQ("v1", Get("foo1")); + ASSERT_OK(Put(1, "Vaporeon", "v1")); + ASSERT_EQ("v1", Get(1, "Vaporeon")); + // writing to default and test cf, flush test cf + // LogNumbers: default: 14, stats: 16, pikachu: 16 + ASSERT_OK(Flush(1)); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_GT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo2", "v2")); + ASSERT_OK(Put("bar2", "v2")); + ASSERT_EQ("v2", Get("bar2")); + ASSERT_EQ("v2", Get("foo2")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + // writing to default and stats cf, flushing default cf + // LogNumbers: default: 19, stats: 19, pikachu: 19 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo3", "v3")); + ASSERT_OK(Put("bar3", "v3")); + ASSERT_EQ("v3", Get("bar3")); + ASSERT_EQ("v3", Get("foo3")); + ASSERT_OK(Put(1, "Jolteon", "v3")); + ASSERT_EQ("v3", Get(1, "Jolteon")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + // writing to all three cf, flushing test cf + // LogNumbers: default: 19, stats: 19, pikachu: 22 + ASSERT_OK(Flush(1)); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + Close(); +} + } // namespace rocksdb int main(int argc, char** argv) { From 9f0bd568897288952329e05bf2354cb21602cd6d Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 1 Jul 2019 12:43:14 -0700 Subject: [PATCH 191/572] Cache simulator: Refactor the cache simulator so that we can add alternative policies easily (#5517) Summary: This PR creates cache_simulator.h file. It contains a CacheSimulator that runs against a block cache trace record. We can add alternative cache simulators derived from CacheSimulator later. 
For example, this PR adds a PrioritizedCacheSimulator that inserts filter/index/uncompressed dictionary blocks with high priority. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5517 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16043689 Pulled By: HaoyuHuang fbshipit-source-id: 65f28ed52b866ffb0e6eceffd7f9ca7c45bb680d --- CMakeLists.txt | 1 + TARGETS | 1 + src.mk | 1 + tools/block_cache_trace_analyzer.cc | 67 +++--------- tools/block_cache_trace_analyzer.h | 49 +-------- utilities/simulator_cache/cache_simulator.cc | 104 +++++++++++++++++++ utilities/simulator_cache/cache_simulator.h | 98 +++++++++++++++++ 7 files changed, 219 insertions(+), 102 deletions(-) create mode 100644 utilities/simulator_cache/cache_simulator.cc create mode 100644 utilities/simulator_cache/cache_simulator.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ff61dca99f..0ca338bd63f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -685,6 +685,7 @@ set(SOURCES utilities/persistent_cache/block_cache_tier_metadata.cc utilities/persistent_cache/persistent_cache_tier.cc utilities/persistent_cache/volatile_tier_impl.cc + utilities/simulator_cache/cache_simulator.cc utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc diff --git a/TARGETS b/TARGETS index a43ed6b1085..3935f1f740d 100644 --- a/TARGETS +++ b/TARGETS @@ -280,6 +280,7 @@ cpp_library( "utilities/persistent_cache/block_cache_tier_metadata.cc", "utilities/persistent_cache/persistent_cache_tier.cc", "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/cache_simulator.cc", "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", diff --git a/src.mk b/src.mk index 71c2bd01803..7c35ee67589 100644 --- a/src.mk +++ b/src.mk @@ -199,6 +199,7 @@ LIB_SOURCES = \ utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/persistent_cache/persistent_cache_tier.cc \ utilities/persistent_cache/volatile_tier_impl.cc \ + utilities/simulator_cache/cache_simulator.cc \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 78753a21622..4770348a79d 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -24,7 +24,7 @@ DEFINE_string( "The config file path. One cache configuration per line. The format of a " "cache configuration is " "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " - "cache_name is lru. cache_capacity can be xK, xM or xG " + "cache_name is lru or lru_priority. 
cache_capacity can be xK, xM or xG " "where x is a positive number."); DEFINE_int32(block_cache_trace_downsample_ratio, 1, "The trace collected accesses on one in every " @@ -179,47 +179,6 @@ double percent(uint64_t numerator, uint64_t denomenator) { } // namespace -BlockCacheTraceSimulator::BlockCacheTraceSimulator( - uint64_t warmup_seconds, uint32_t downsample_ratio, - const std::vector& cache_configurations) - : warmup_seconds_(warmup_seconds), - downsample_ratio_(downsample_ratio), - cache_configurations_(cache_configurations) { - for (auto const& config : cache_configurations_) { - for (auto cache_capacity : config.cache_capacities) { - // Scale down the cache capacity since the trace contains accesses on - // 1/'downsample_ratio' blocks. - uint64_t simulate_cache_capacity = - cache_capacity / downsample_ratio_; - sim_caches_.push_back(NewSimCache( - NewLRUCache(simulate_cache_capacity, config.num_shard_bits), - /*real_cache=*/nullptr, config.num_shard_bits)); - } - } -} - -void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { - if (trace_start_time_ == 0) { - trace_start_time_ = access.access_timestamp; - } - // access.access_timestamp is in microseconds. - if (!warmup_complete_ && - trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= - access.access_timestamp) { - for (auto& sim_cache : sim_caches_) { - sim_cache->reset_counter(); - } - warmup_complete_ = true; - } - for (auto& sim_cache : sim_caches_) { - auto handle = sim_cache->Lookup(access.block_key); - if (handle == nullptr && !access.no_insert) { - sim_cache->Insert(access.block_key, /*value=*/nullptr, access.block_size, - /*deleter=*/nullptr); - } - } -} - void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (!cache_simulator_) { return; @@ -237,27 +196,21 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { const std::string header = "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"; out << header << std::endl; - uint64_t sim_cache_index = 0; - for (auto const& config : cache_simulator_->cache_configurations()) { - for (auto cache_capacity : config.cache_capacities) { - uint64_t hits = - cache_simulator_->sim_caches()[sim_cache_index]->get_hit_counter(); - uint64_t misses = - cache_simulator_->sim_caches()[sim_cache_index]->get_miss_counter(); - uint64_t total_accesses = hits + misses; - double miss_ratio = static_cast(misses * 100.0 / total_accesses); + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + double miss_ratio = config_caches.second[i]->miss_ratio(); // Write the body. 
out << config.cache_name; out << ","; out << config.num_shard_bits; out << ","; - out << cache_capacity; + out << config.cache_capacities[i]; out << ","; out << std::fixed << std::setprecision(4) << miss_ratio; out << ","; - out << total_accesses; + out << config_caches.second[i]->total_accesses(); out << std::endl; - sim_cache_index++; } } out.close(); @@ -1095,6 +1048,12 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { if (!cache_configs.empty()) { cache_simulator.reset(new BlockCacheTraceSimulator( warmup_seconds, downsample_ratio, cache_configs)); + Status s = cache_simulator->InitializeCaches(); + if (!s.ok()) { + fprintf(stderr, "Cannot initialize cache simulators %s\n", + s.ToString().c_str()); + exit(1); + } } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 21a99f7db76..617b90280c9 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -12,57 +12,10 @@ #include "rocksdb/env.h" #include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" +#include "utilities/simulator_cache/cache_simulator.h" namespace rocksdb { -const uint64_t kMicrosInSecond = 1000000; - -class BlockCacheTraceAnalyzer; - -// A cache configuration provided by user. -struct CacheConfiguration { - std::string cache_name; // LRU. - uint32_t num_shard_bits; - std::vector - cache_capacities; // simulate cache capacities in bytes. -}; - -// A block cache simulator that reports miss ratio curves given a set of cache -// configurations. -class BlockCacheTraceSimulator { - public: - // warmup_seconds: The number of seconds to warmup simulated caches. The - // hit/miss counters are reset after the warmup completes. - BlockCacheTraceSimulator( - uint64_t warmup_seconds, uint32_t downsample_ratio, - const std::vector& cache_configurations); - ~BlockCacheTraceSimulator() = default; - // No copy and move. - BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; - BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; - BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; - BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; - - void Access(const BlockCacheTraceRecord& access); - - const std::vector>& sim_caches() const { - return sim_caches_; - } - - const std::vector& cache_configurations() const { - return cache_configurations_; - } - - private: - const uint64_t warmup_seconds_; - const uint32_t downsample_ratio_; - const std::vector cache_configurations_; - - bool warmup_complete_ = false; - std::vector> sim_caches_; - uint64_t trace_start_time_ = 0; -}; - // Statistics of a block. struct BlockAccessInfo { uint64_t num_accesses = 0; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc new file mode 100644 index 00000000000..145efdb6cba --- /dev/null +++ b/utilities/simulator_cache/cache_simulator.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "utilities/simulator_cache/cache_simulator.h" + +namespace rocksdb { +CacheSimulator::CacheSimulator(std::shared_ptr sim_cache) + : sim_cache_(sim_cache) {} + +void CacheSimulator::Access(const BlockCacheTraceRecord& access) { + auto handle = sim_cache_->Lookup(access.block_key); + if (handle == nullptr && !access.no_insert) { + sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr, /*handle=*/nullptr); + } +} + +void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { + auto handle = sim_cache_->Lookup(access.block_key); + if (handle == nullptr && !access.no_insert) { + Cache::Priority priority = Cache::Priority::LOW; + if (access.block_type == TraceType::kBlockTraceFilterBlock || + access.block_type == TraceType::kBlockTraceIndexBlock || + access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { + priority = Cache::Priority::HIGH; + } + sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr, /*handle=*/nullptr, priority); + } +} + +double CacheSimulator::miss_ratio() { + uint64_t hits = sim_cache_->get_hit_counter(); + uint64_t misses = sim_cache_->get_miss_counter(); + uint64_t total_accesses = hits + misses; + return static_cast(misses * 100.0 / total_accesses); +} + +uint64_t CacheSimulator::total_accesses() { + return sim_cache_->get_hit_counter() + sim_cache_->get_miss_counter(); +} + +BlockCacheTraceSimulator::BlockCacheTraceSimulator( + uint64_t warmup_seconds, uint32_t downsample_ratio, + const std::vector& cache_configurations) + : warmup_seconds_(warmup_seconds), + downsample_ratio_(downsample_ratio), + cache_configurations_(cache_configurations) {} + +Status BlockCacheTraceSimulator::InitializeCaches() { + for (auto const& config : cache_configurations_) { + for (auto cache_capacity : config.cache_capacities) { + // Scale down the cache capacity since the trace contains accesses on + // 1/'downsample_ratio' blocks. + uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_; + std::shared_ptr sim_cache; + if (config.cache_name == "lru") { + sim_cache = std::make_shared(NewSimCache( + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0), + /*real_cache=*/nullptr, config.num_shard_bits)); + } else if (config.cache_name == "lru_priority") { + sim_cache = std::make_shared(NewSimCache( + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*real_cache=*/nullptr, config.num_shard_bits)); + } else { + // Not supported. + return Status::InvalidArgument("Unknown cache name " + + config.cache_name); + } + sim_caches_[config].push_back(sim_cache); + } + } + return Status::OK(); +} + +void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { + if (trace_start_time_ == 0) { + trace_start_time_ = access.access_timestamp; + } + // access.access_timestamp is in microseconds. 
+ if (!warmup_complete_ &&
+ trace_start_time_ + warmup_seconds_ * kMicrosInSecond <=
+ access.access_timestamp) {
+ for (auto& config_caches : sim_caches_) {
+ for (auto& sim_cache : config_caches.second) {
+ sim_cache->reset_counter();
+ }
+ }
+ warmup_complete_ = true;
+ }
+ for (auto& config_caches : sim_caches_) {
+ for (auto& sim_cache : config_caches.second) {
+ sim_cache->Access(access);
+ }
+ }
+}
+
+} // namespace rocksdb

diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h
new file mode 100644
index 00000000000..37166d8a9c4
--- /dev/null
+++ b/utilities/simulator_cache/cache_simulator.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/utilities/sim_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace rocksdb {
+
+const uint64_t kMicrosInSecond = 1000000;
+
+// A cache configuration provided by user.
+struct CacheConfiguration {
+ std::string cache_name; // LRU.
+ uint32_t num_shard_bits;
+ std::vector
+ cache_capacities; // simulate cache capacities in bytes.
+
+ bool operator==(const CacheConfiguration& o) const {
+ return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits;
+ }
+ bool operator<(const CacheConfiguration& o) const {
+ return cache_name < o.cache_name ||
+ (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits);
+ }
+};
+
+// A cache simulator that runs against a block cache trace.
+class CacheSimulator {
+ public:
+ CacheSimulator(std::shared_ptr sim_cache);
+ virtual ~CacheSimulator() = default;
+ // No copy and move.
+ CacheSimulator(const CacheSimulator&) = delete;
+ CacheSimulator& operator=(const CacheSimulator&) = delete;
+ CacheSimulator(CacheSimulator&&) = delete;
+ CacheSimulator& operator=(CacheSimulator&&) = delete;
+
+ virtual void Access(const BlockCacheTraceRecord& access);
+ void reset_counter() { sim_cache_->reset_counter(); }
+ double miss_ratio();
+ uint64_t total_accesses();
+
+ protected:
+ std::shared_ptr sim_cache_;
+};
+
+// A prioritized cache simulator that runs against a block cache trace.
+// It inserts missing index/filter/uncompression-dictionary blocks with high
+// priority in the cache.
+class PrioritizedCacheSimulator : public CacheSimulator {
+ public:
+ PrioritizedCacheSimulator(std::shared_ptr sim_cache)
+ : CacheSimulator(sim_cache) {}
+ void Access(const BlockCacheTraceRecord& access) override;
+};
+
+// A block cache simulator that reports miss ratio curves given a set of cache
+// configurations.
+class BlockCacheTraceSimulator {
+ public:
+ // warmup_seconds: The number of seconds to warmup simulated caches. The
+ // hit/miss counters are reset after the warmup completes.
+ BlockCacheTraceSimulator(
+ uint64_t warmup_seconds, uint32_t downsample_ratio,
+ const std::vector& cache_configurations);
+ ~BlockCacheTraceSimulator() = default;
+ // No copy and move.
+ BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete; + BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete; + BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete; + + Status InitializeCaches(); + + void Access(const BlockCacheTraceRecord& access); + + const std::map>>& + sim_caches() const { + return sim_caches_; + } + + private: + const uint64_t warmup_seconds_; + const uint32_t downsample_ratio_; + const std::vector cache_configurations_; + + bool warmup_complete_ = false; + std::map>> + sim_caches_; + uint64_t trace_start_time_ = 0; +}; + +} // namespace rocksdb From f872009237762abb504c32c781a5b337033f401c Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Mon, 1 Jul 2019 13:02:30 -0700 Subject: [PATCH 192/572] Fix from some C-style casting (#5524) Summary: Fix from some C-style casting in bloom.cc and ./tools/db_bench_tool.cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/5524 Differential Revision: D16075626 Pulled By: elipoz fbshipit-source-id: 352948885efb64a7ef865942c75c3c727a914207 --- tools/db_bench_tool.cc | 29 ++++++++++++++++++----------- util/bloom.cc | 6 +++--- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 9b3e2cac35f..cb5b5a38a66 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2397,16 +2397,19 @@ class Benchmark { return nullptr; } if (FLAGS_use_clock_cache) { - auto cache = NewClockCache((size_t)capacity, FLAGS_cache_numshardbits); + auto cache = + NewClockCache(static_cast(capacity), FLAGS_cache_numshardbits); if (!cache) { fprintf(stderr, "Clock cache not supported."); exit(1); } return cache; } else { - return NewLRUCache((size_t)capacity, FLAGS_cache_numshardbits, - false /*strict_capacity_limit*/, - FLAGS_cache_high_pri_pool_ratio); + return NewLRUCache( + static_cast(capacity), + FLAGS_cache_numshardbits, + false /*strict_capacity_limit*/, + FLAGS_cache_high_pri_pool_ratio); } } @@ -3604,9 +3607,12 @@ class Benchmark { } if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() != - (unsigned int)FLAGS_num_levels) { - fprintf(stderr, "Insufficient number of fanouts specified %d\n", - (int)FLAGS_max_bytes_for_level_multiplier_additional_v.size()); + static_cast(FLAGS_num_levels)) { + fprintf( + stderr, + "Insufficient number of fanouts specified %d\n", + static_cast( + FLAGS_max_bytes_for_level_multiplier_additional_v.size())); exit(1); } options.max_bytes_for_level_multiplier_additional = @@ -4791,7 +4797,7 @@ class Benchmark { if (FLAGS_multiread_stride) { int64_t key = GetRandomKey(&thread->rand); if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >= - (int64_t)FLAGS_num) { + static_cast(FLAGS_num)) { key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride; } for (int64_t i = 0; i < entries_per_batch_; ++i) { @@ -5161,9 +5167,10 @@ class Benchmark { FLAGS_num, &lower_bound); options.iterate_lower_bound = &lower_bound; } else { + auto min_num = + std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance); GenerateKeyFromInt( - (uint64_t)std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance), - FLAGS_num, &upper_bound); + static_cast(min_num), FLAGS_num, &upper_bound); options.iterate_upper_bound = &upper_bound; } } @@ -5331,7 +5338,7 @@ class Benchmark { // Wait for the writes to be finished if (!hint_printed) { fprintf(stderr, "Reads are 
finished. Have %d more writes to do\n",
- (int)writes_ - written);
+ static_cast(writes_) - written);
 hint_printed = true;
 }
 } else {

diff --git a/util/bloom.cc b/util/bloom.cc
index 953a42fa213..f859ab7dd64 100644
--- a/util/bloom.cc
+++ b/util/bloom.cc
@@ -104,7 +104,7 @@ int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
 assert(bits_per_key_);
 assert(space > 0);
 uint32_t dont_care1, dont_care2;
- int high = (int) (space * 8 / bits_per_key_ + 1);
+ int high = static_cast(space * 8 / bits_per_key_ + 1);
 int low = 1;
 int n = high;
 for (; n >= low; n--) {
@@ -120,7 +120,7 @@ int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
 inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines,
 uint32_t total_bits) {
 #ifdef NDEBUG
- (void)total_bits;
+ static_cast(total_bits);
 #endif
 assert(num_lines > 0 && total_bits > 0);
@@ -340,7 +340,7 @@ class BloomFilterPolicy : public FilterPolicy {
 dst->resize(init_size + bytes, 0);
 dst->push_back(static_cast(num_probes_)); // Remember # of probes
 char* array = &(*dst)[init_size];
- for (size_t i = 0; i < (size_t)n; i++) {
+ for (size_t i = 0; i < static_cast(n); i++) {
 // Use double-hashing to generate a sequence of hash values.
 // See analysis in [Kirsch,Mitzenmacher 2006].
 uint32_t h = hash_func_(keys[i]);

From 1e87f2b68b01db0579fb98491114e8f059f680be Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Mon, 1 Jul 2019 14:04:10 -0700
Subject: [PATCH 193/572] Ref and unref cfd before and after calling WaitForFlushMemTables (#5513)

Summary:
This is to prevent the bg flush thread from unrefing and deleting the cfd that has been dropped by a concurrent thread. Before RocksDB calls `DBImpl::WaitForFlushMemTables`, we should increase the refcount of each `ColumnFamilyData` so that its ref count will not drop to 0 even if the column family is dropped by another thread. Otherwise the bg flush thread can deref the cfd and delete it, causing a segfault in `WaitForFlushMemtables` upon accessing `cfd`.

Test plan (on devserver):
```
$make clean && COMPILE_WITH_ASAN=1 make -j32
$make check
```
All unit tests must pass.
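Condensed from the diff below (a sketch of the pattern, not a drop-in excerpt), the fix pins each `ColumnFamilyData` before waiting and unpins it afterwards:

```
// Sketch condensed from the diff below: pin each cfd so a concurrent
// DropColumnFamily cannot free it while we wait, then unref afterwards
// and delete on the last reference.
for (auto* tmp_cfd : cfds) {
  tmp_cfd->Ref();
}
s = WaitForFlushMemTables(cfds, flush_memtable_ids,
                          (flush_reason == FlushReason::kErrorRecovery));
for (auto* tmp_cfd : cfds) {
  if (tmp_cfd->Unref()) {
    // Only one thread can observe the ref count dropping to zero.
    InstrumentedMutexLock lock_guard(&mutex_);
    delete tmp_cfd;
  }
}
```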
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5513 Differential Revision: D16062898 Pulled By: riversand963 fbshipit-source-id: 37dc511f1dc99f036d0201bbd7f0a8f5677c763d --- db/db_flush_test.cc | 76 ++++++++++++++++++++++++++ db/db_impl/db_impl.h | 10 ++++ db/db_impl/db_impl_compaction_flush.cc | 39 ++++++++++++- db/db_impl/db_impl_debug.cc | 10 ++++ 4 files changed, 133 insertions(+), 2 deletions(-) diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index b901a5a7805..034ec63226c 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -290,6 +290,39 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { Close(); } +TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) { + Options options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:AfterScheduleFlush", + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_.resize(1); + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -545,6 +578,49 @@ TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { handles_.clear(); } +TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(0, "key", "value")); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + auto* cfd_pikachu = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + delete handles_[1]; + handles_.resize(1); + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions 
flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu}, + flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index e57768a74af..737f2337608 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -788,6 +788,16 @@ class DBImpl : public DB { Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, ColumnFamilyHandle* cfh = nullptr); + Status TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts); + + // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This + // is because in certain cases, we can flush column families, wait for the + // flush to complete, but delete the column family handle before the wait + // finishes. For example in CompactRange. + Status TEST_AtomicFlushMemTables(const autovector& cfds, + const FlushOptions& flush_opts); + // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index ff03e591d28..67292401683 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1591,6 +1591,16 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, ColumnFamilyData* loop_cfd = elem.first; loop_cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->Ref(); + } + } SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); } @@ -1599,7 +1609,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, write_thread_.ExitUnbatched(&w); } } - + TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; @@ -1609,6 +1620,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* tmp_cfd : cfds) { + if (tmp_cfd->Unref()) { + // Only one thread can reach here. + InstrumentedMutexLock lock_guard(&mutex_); + delete tmp_cfd; + } + } } TEST_SYNC_POINT("FlushMemTableFinished"); return s; @@ -1672,6 +1690,15 @@ Status DBImpl::AtomicFlushMemTables( for (auto cfd : cfds) { cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. 
+ if (flush_options.wait) { + for (auto cfd : cfds) { + cfd->Ref(); + } + } GenerateFlushRequest(cfds, &flush_req); SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); @@ -1682,7 +1709,7 @@ Status DBImpl::AtomicFlushMemTables( } } TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush"); - + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector flush_memtable_ids; for (auto& iter : flush_req) { @@ -1690,6 +1717,13 @@ Status DBImpl::AtomicFlushMemTables( } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* cfd : cfds) { + if (cfd->Unref()) { + // Only one thread can reach here. + InstrumentedMutexLock lock_guard(&mutex_); + delete cfd; + } + } } return s; } @@ -2151,6 +2185,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); // All the CFDs in the FlushReq must have the same flush reason, so just // grab the first one *reason = bg_flush_args[0].cfd_->GetFlushReason(); diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index ec1e1b47752..ec8489848c5 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -122,6 +122,16 @@ Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, return FlushMemTable(cfd, fo, FlushReason::kTest); } +Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts) { + return FlushMemTable(cfd, flush_opts, FlushReason::kTest); +} + +Status DBImpl::TEST_AtomicFlushMemTables( + const autovector& cfds, const FlushOptions& flush_opts) { + return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { From 3e6c18538130a4fafb491a5a45dc614958cfe50b Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Mon, 1 Jul 2019 14:53:51 -0700 Subject: [PATCH 194/572] Formatting fixes in db_bench_tool (#5525) Summary: Formatting fixes in db_bench_tool that were accidentally omitted Pull Request resolved: https://github.com/facebook/rocksdb/pull/5525 Test Plan: Unit tests Differential Revision: D16078516 Pulled By: elipoz fbshipit-source-id: bf8df0e3f08092a91794ebf285396d9b8a335bb9 --- tools/db_bench_tool.cc | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index cb5b5a38a66..8344669b75c 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2397,8 +2397,8 @@ class Benchmark { return nullptr; } if (FLAGS_use_clock_cache) { - auto cache = - NewClockCache(static_cast(capacity), FLAGS_cache_numshardbits); + auto cache = NewClockCache(static_cast(capacity), + FLAGS_cache_numshardbits); if (!cache) { fprintf(stderr, "Clock cache not supported."); exit(1); @@ -2406,10 +2406,8 @@ class Benchmark { return cache; } else { return NewLRUCache( - static_cast(capacity), - FLAGS_cache_numshardbits, - false /*strict_capacity_limit*/, - FLAGS_cache_high_pri_pool_ratio); + static_cast(capacity), FLAGS_cache_numshardbits, + false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio); } } @@ -3608,11 +3606,9 @@ class Benchmark { if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) { if 
(FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
 static_cast(FLAGS_num_levels)) {
- fprintf(
- stderr,
- "Insufficient number of fanouts specified %d\n",
- static_cast(
- FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
+ fprintf(stderr, "Insufficient number of fanouts specified %d\n",
+ static_cast(
+ FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
 exit(1);
 }
 options.max_bytes_for_level_multiplier_additional =
@@ -5168,9 +5164,9 @@ class Benchmark {
 options.iterate_lower_bound = &lower_bound;
 } else {
 auto min_num =
- std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
- GenerateKeyFromInt(
- static_cast(min_num), FLAGS_num, &upper_bound);
+ std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
+ GenerateKeyFromInt(static_cast(min_num), FLAGS_num,
+ &upper_bound);
 options.iterate_upper_bound = &upper_bound;
 }
 }

From 66464d1fde0257af79b97018a2f6be554f41ff20 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Mon, 1 Jul 2019 15:11:43 -0700
Subject: [PATCH 195/572] Remove multiple declarations of kMicrosInSecond.

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5526

Test Plan:
OPT=-g V=1 make J=1 unity_test -j32
make clean && make -j32

Differential Revision: D16079315

Pulled By: HaoyuHuang

fbshipit-source-id: 294ab439cf0db8dd5da44e30eabf0cbb2bb8c4f6
---
 db/db_impl/db_impl.cc | 1 -
 tools/block_cache_trace_analyzer_test.cc | 2 +-
 tools/db_bench_tool.cc | 1 -
 trace_replay/block_cache_tracer.cc | 1 +
 trace_replay/block_cache_tracer.h | 2 ++
 utilities/simulator_cache/cache_simulator.h | 2 --
 6 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index e2de696ef57..55f89eab32e 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -101,7 +101,6 @@ namespace rocksdb {
 const std::string kDefaultColumnFamilyName("default");
 const std::string kPersistentStatsColumnFamilyName(
 "___rocksdb_stats_history___");
-const int kMicrosInSecond = 1000 * 1000;
 void DumpRocksDBBuildVersion(Logger* log);
 CompressionType GetCompressionFlush(

diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
index 80734565a3d..21d8bcbbb3f 100644
--- a/tools/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -91,7 +91,7 @@ class BlockCacheTracerTest : public testing::Test {
 assert(writer);
 for (uint32_t i = 0; i < nblocks; i++) {
 uint32_t key_id = from_key_id + i;
- uint32_t timestamp = (key_id + 1) * kMicrosInSecond;
+ uint64_t timestamp = (key_id + 1) * kMicrosInSecond;
 BlockCacheTraceRecord record;
 record.block_type = block_type;
 record.block_size = kBlockSize + key_id;

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 8344669b75c..abffae5d9e8 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1547,7 +1547,6 @@ class ReporterAgent {
 private:
 std::string Header() const { return "secs_elapsed,interval_qps"; }
 void SleepAndReport() {
- uint64_t kMicrosInSecond = 1000 * 1000;
 auto time_started = env_->NowMicros();
 while (true) {
 {

diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index b163216d874..cc875bf0dcd 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -28,6 +28,7 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) {
 }
 } // namespace
+const uint64_t kMicrosInSecond = 1000 * 1000;
 const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
 "UnknownColumnFamily";

diff --git
a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e2ad933b9b8..e21111727c9 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -16,6 +16,8 @@ namespace rocksdb { +extern const uint64_t kMicrosInSecond; + // Lookup context for tracing block cache accesses. // We trace block accesses at five places: // 1. BlockBasedTable::GetFilter diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index 37166d8a9c4..b391d5dc8a5 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -10,8 +10,6 @@ namespace rocksdb { -const uint64_t kMicrosInSecond = 1000000; - // A cache configuration provided by user. struct CacheConfiguration { std::string cache_name; // LRU. From cfdf2116d38cd39763528ce2f3a01e661700c601 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Mon, 1 Jul 2019 16:32:59 -0700 Subject: [PATCH 196/572] Exclude StatsHistoryTest.ForceManualFlushStatsCF test from lite mode (#5529) Summary: Recent commit 3886dddc3b44bf5061c0f93eab578c51e8bad7bd introduced a new test which is not compatible with lite mode and breaks contrun test: ``` [ RUN ] StatsHistoryTest.ForceManualFlushStatsCF monitoring/stats_history_test.cc:642: Failure Expected: (cfd_stats->GetLogNumber()) < (cfd_test->GetLogNumber()), actual: 15 vs 15 ``` This PR excludes the test from lite mode to appease the failing test Pull Request resolved: https://github.com/facebook/rocksdb/pull/5529 Differential Revision: D16080892 Pulled By: miasantreble fbshipit-source-id: 2f8a22758f71250cd9f204046404226ddc13b028 --- monitoring/stats_history_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index bef928558d7..9adacdbf7bc 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -567,7 +567,6 @@ TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); } -#endif // !ROCKSDB_LITE TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { Options options; @@ -644,6 +643,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { Close(); } +#endif // !ROCKSDB_LITE } // namespace rocksdb int main(int argc, char** argv) { From 662ce6204406f4377044e9fd34fb8dc502ca4df7 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 2 Jul 2019 11:45:32 -0700 Subject: [PATCH 197/572] Reduce iterator key comparison for upper/lower bound check (2nd attempt) (#5468) Summary: This is a second attempt for https://github.com/facebook/rocksdb/issues/5111, with the fix to redo iterate bounds check after `SeekXXX()`. This is because MyRocks may change iterate bounds between seek. See https://github.com/facebook/rocksdb/issues/5111 for original benchmark result and discussion. Closes https://github.com/facebook/rocksdb/issues/5463. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5468 Test Plan: Existing rocksdb tests, plus myrocks test `rocksdb.optimizer_loose_index_scans` and `rocksdb.group_min_max`. 
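The core idea, condensed from the db_iter.cc hunk below (a sketch, not a drop-in excerpt): the per-key comparison against the iterate bound is only performed when the child iterator reports that the current entry may be out of bound; keys from data blocks known to be fully in-bound skip the comparison entirely.

```
// Condensed from the diff below: skip the (potentially expensive) user-key
// comparison against iterate_upper_bound_ whenever the current block is
// known to lie entirely below the bound.
if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
    user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
  break;  // reached the upper bound; stop scanning forward
}
```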
Differential Revision: D15863332

fbshipit-source-id: ab4aba5899838591806b8673899bd465f3f53e18
---
 HISTORY.md | 1 +
 db/db_iter.cc | 9 ++-
 db/db_iterator_test.cc | 62 +++++++++++++++++++
 db/version_set.cc | 48 +++++++++++---
 table/block_based/block_based_table_reader.cc | 30 ++++++---
 table/block_based/block_based_table_reader.h | 15 ++++-
 table/internal_iterator.h | 25 +++++++-
 table/iterator_wrapper.h | 22 +++++--
 table/merging_iterator.cc | 24 +++++++
 9 files changed, 212 insertions(+), 24 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2c8dc8c3ab9..c3af6ba06d7 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -24,6 +24,7 @@
 * Reduce binary search when iterator reseek into the same data block.
 * DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
 * Merging iterator to avoid child iterator reseek for some cases
+* Reduce iterator key comparison for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel

diff --git a/db/db_iter.cc b/db/db_iter.cc
index b89d7301131..633724c5763 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -467,7 +467,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check)
 is_key_seqnum_zero_ = (ikey_.sequence == 0);
- if (iterate_upper_bound_ != nullptr &&
+ assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
+ user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
+ if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
 user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
 break;
 }
@@ -859,7 +861,10 @@ void DBIter::PrevInternal() {
 return;
 }
- if (iterate_lower_bound_ != nullptr &&
+ assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+ user_comparator_.Compare(saved_key_.GetUserKey(),
+ *iterate_lower_bound_) >= 0);
+ if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
 user_comparator_.Compare(saved_key_.GetUserKey(),
 *iterate_lower_bound_) < 0) {
 // We've iterated earlier than the user-specified lower bound.

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index d514e7683de..67a97b20b81 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -2759,6 +2759,68 @@ TEST_P(DBIteratorTest, AvoidReseekChildIterator) {
 SyncPoint::GetInstance()->DisableProcessing();
 }
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::string value(50, 'v');
+ Reopen(options);
+ ASSERT_OK(Put("aaa", value));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("eee", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string ub1 = "e";
+ std::string ub2 = "c";
+ Slice ub(ub1);
+ ReadOptions read_opts1;
+ read_opts1.iterate_upper_bound = &ub;
+ Iterator* iter = NewIterator(read_opts1);
+ // Seek and iterate across block boundary.
+ iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + ub = Slice(ub2); + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + + std::string lb1 = "a"; + std::string lb2 = "c"; + Slice lb(lb1); + ReadOptions read_opts2; + read_opts2.iterate_lower_bound = &lb; + iter = NewIterator(read_opts2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + lb = Slice(lb2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 8e2d21b051a..3354959aac4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -887,7 +887,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -895,23 +895,38 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); @@ -955,6 +970,7 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + CheckMayBeOutOfLowerBound(); return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, @@ -963,6 +979,19 @@ class LevelIterator final : public InternalIterator { largest_compaction_key); } + // Check if current file being fully within iterate_lower_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update may_be_out_of_lower_bound_ accordingly. 
+ void CheckMayBeOutOfLowerBound() { + if (Valid() && read_options_.iterate_lower_bound != nullptr) { + may_be_out_of_lower_bound_ = + user_comparator_.Compare( + ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; + } + } + TableCache* table_cache_; const ReadOptions read_options_; const EnvOptions& env_options_; @@ -976,6 +1005,7 @@ class LevelIterator final : public InternalIterator { bool should_sample_; TableReaderCaller caller_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1011,6 +1041,7 @@ void LevelIterator::Seek(const Slice& target) { file_iter_.Seek(target); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekForPrev(const Slice& target) { @@ -1024,6 +1055,7 @@ void LevelIterator::SeekForPrev(const Slice& target) { file_iter_.SeekForPrev(target); SkipEmptyFileBackward(); } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToFirst() { @@ -1032,6 +1064,7 @@ void LevelIterator::SeekToFirst() { file_iter_.SeekToFirst(); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToLast() { @@ -1040,15 +1073,17 @@ void LevelIterator::SeekToLast() { file_iter_.SeekToLast(); } SkipEmptyFileBackward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -4366,10 +4401,9 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index edddecf78bd..87756f2e240 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2896,6 +2896,7 @@ void BlockBasedTableIterator::SeekImpl( FindKeyForward(); } + CheckDataBlockWithinUpperBound(); CheckOutOfBound(); if (target) { @@ -2952,6 +2953,7 @@ void BlockBasedTableIterator::SeekForPrev( block_iter_.SeekForPrev(target); FindKeyBackward(); + CheckDataBlockWithinUpperBound(); assert(!block_iter_.Valid() || icomp_.Compare(target, block_iter_.key()) >= 0); } @@ -2969,6 +2971,7 @@ void BlockBasedTableIterator::SeekToLast() { InitDataBlock(); block_iter_.SeekToLast(); FindKeyBackward(); + CheckDataBlockWithinUpperBound(); } template @@ -2984,11 +2987,12 @@ void BlockBasedTableIterator::Next() { template bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { + IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -3087,6 +3091,7 @@ void 
BlockBasedTableIterator::InitDataBlock() { /*for_compaction=*/lookup_context_.caller == TableReaderCaller::kCompaction); block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); } } @@ -3140,13 +3145,12 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, index_iter_->user_key()) <= 0); - } ResetDataIter(); index_iter_->Next(); if (next_block_is_out_of_bound) { @@ -3210,6 +3214,16 @@ void BlockBasedTableIterator::CheckOutOfBound() { } } +template +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 358bc8b8d22..750700813d3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -652,7 +652,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return !is_out_of_bound_ && @@ -702,6 +702,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } @@ -768,6 +773,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool block_iter_points_to_real_block_; // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // Whether current data block being fully within iterate upper bound. + bool data_block_within_upper_bound_ = false; // True if we're standing at the first key of a block, and we haven't loaded // that block yet. A call to value() will trigger loading the block. bool is_at_first_key_from_index_ = false; @@ -802,6 +809,12 @@ class BlockBasedTableIterator : public InternalIteratorBase { void FindBlockForward(); void FindKeyBackward(); void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. 
+ void CheckDataBlockWithinUpperBound(); }; } // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 696e66135dc..426ff396548 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,6 +17,11 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: @@ -55,11 +60,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -97,6 +111,13 @@ class InternalIteratorBase : public Cleanable { // keys above the upper bound, IsOutOfBound() must return false. virtual bool IsOutOfBound() { return false; } + // Keys return from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys return from this iterator can be larger or equal to + // iterate_upper_bound. + virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index a570e53c1e2..a5aa5c49eac 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -56,7 +56,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); @@ -65,7 +68,7 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } @@ -83,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -100,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; 
bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 207066b5a1e..1a0d4df8995 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -227,6 +227,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -296,6 +306,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from the current child iterator: as long as the current child iterator + // cannot report an out-of-bound key, we know the current key is within + // bounds. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { From 0d57d93a06727943dbad0bc80768a29d74ce22a0 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 2 Jul 2019 12:03:40 -0700 Subject: [PATCH 198/572] Support jemalloc compiled with `--with-jemalloc-prefix` (#5521) Summary: Previously, if jemalloc was built with a nonempty string for `--with-jemalloc-prefix`, then `HasJemalloc()` would return false on Linux, so jemalloc would not be used at runtime. On Mac, it would cause a linker failure due to no definitions found for the weak functions declared in "port/jemalloc_helper.h". This should be a rare problem because (1) on Linux the default `--with-jemalloc-prefix` value is the empty string, and (2) Homebrew's build explicitly sets `--with-jemalloc-prefix` to the empty string. However, there are cases where `--with-jemalloc-prefix` is nonempty. For example, when building jemalloc from source on Mac, the default setting is `--with-jemalloc-prefix=je_`. Such jemalloc builds should be usable by RocksDB. The fix is simple. Defining `JEMALLOC_MANGLE` before including "jemalloc.h" causes it to define unprefixed symbols that are aliases for each of the prefixed symbols. Thanks to benesch for figuring this out and explaining it to me. Fixes https://github.com/facebook/rocksdb/issues/1462. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5521 Test Plan: build jemalloc with prefixed symbols: ``` $ ./configure --with-jemalloc-prefix=lol $ make ``` compile rocksdb against it: ``` $ WITH_JEMALLOC_FLAG=1 JEMALLOC=1 EXTRA_LDFLAGS="-L/home/andrew/jemalloc/lib/" EXTRA_CXXFLAGS="-I/home/andrew/jemalloc/include/" make -j12 ./db_bench ``` run db_bench and verify jemalloc actually used: ``` $ ./db_bench -benchmarks=fillrandom -statistics=true -dump_malloc_stats=true -stats_dump_period_sec=1 $ grep jemalloc /tmp/rocksdbtest-1000/dbbench/LOG 2019/06/29-12:20:52.088658 7fc5fb7f6700 [_impl/db_impl.cc:837] ___ Begin jemalloc statistics ___ ...
``` Differential Revision: D16092758 fbshipit-source-id: c2c358346190ed62ceb2a3547a6c4c180b12f7c4 --- db/malloc_stats.cc | 4 ---- port/jemalloc_helper.h | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index bcee5c3fbfe..1dfe0d55b43 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -20,10 +20,6 @@ namespace rocksdb { #ifdef ROCKSDB_JEMALLOC -#ifdef JEMALLOC_NO_RENAME -#define malloc_stats_print je_malloc_stats_print -#endif - typedef struct { char* cur; char* end; diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 26e5fb66336..6aeb780ee6e 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -9,6 +9,7 @@ #ifdef __FreeBSD__ #include #else +#define JEMALLOC_MANGLE #include #endif From 09ea5d8944700be9ce00fdd66f29f34573f33e76 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 2 Jul 2019 12:58:50 -0700 Subject: [PATCH 199/572] Fix clang build with jemalloc (#5522) Summary: Fixes the below build failure for clang compiler using glibc and jemalloc. Platform: linux x86-64 Compiler: clang version 6.0.0-1ubuntu2 Build failure: ``` $ CXX=clang++ CC=clang USE_CLANG=1 WITH_JEMALLOC_FLAG=1 JEMALLOC=1 EXTRA_LDFLAGS="-L/home/andrew/jemalloc/lib/" EXTRA_CXXFLAGS="-I/home/andrew/jemalloc/include/" make check -j12 ... CC memory/jemalloc_nodump_allocator.o In file included from memory/jemalloc_nodump_allocator.cc:6: In file included from ./memory/jemalloc_nodump_allocator.h:11: In file included from ./port/jemalloc_helper.h:16: /usr/include/clang/6.0.0/include/mm_malloc.h:39:16: error: 'posix_memalign' is missing exception specification 'throw()' extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size); ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:388:26: note: expanded from macro 'posix_memalign' # define posix_memalign je_posix_memalign ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:77:29: note: expanded from macro 'je_posix_memalign' # define je_posix_memalign posix_memalign ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:232:38: note: previous declaration is here JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_posix_memalign(void **memptr, ^ /home/andrew/jemalloc/include/jemalloc/jemalloc.h:77:29: note: expanded from macro 'je_posix_memalign' # define je_posix_memalign posix_memalign ^ 1 error generated. Makefile:1972: recipe for target 'memory/jemalloc_nodump_allocator.o' failed make: *** [memory/jemalloc_nodump_allocator.o] Error 1 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5522 Differential Revision: D16069869 Pulled By: miasantreble fbshipit-source-id: c489bbc993adee194b9a550134c6237a264bc443 --- port/jemalloc_helper.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 6aeb780ee6e..a9095ec98dc 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -5,6 +5,19 @@ #pragma once +#if defined(__clang__) +// glibc's `posix_memalign()` declaration specifies `throw()` while clang's +// declaration does not. There is a hack in clang to make its re-declaration +// compatible with glibc's if they are declared consecutively. That hack breaks +// if yet another `posix_memalign()` declaration comes between glibc's and +// clang's declarations. Include "mm_malloc.h" here ensures glibc's and clang's +// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration. 
+// +// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()` +// declaration did not specify `throw()` when built with clang. +#include +#endif + #ifdef ROCKSDB_JEMALLOC #ifdef __FreeBSD__ #include From 84c5c9aab15896e1c55c3febfa1fac5ed2009069 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Wed, 3 Jul 2019 18:36:08 -0700 Subject: [PATCH 200/572] Fix a bug in compaction reads causing checksum mismatches and asan errors (#5531) Summary: Fixed a bug in compaction reads due to which incorrect number of bytes were being read/utilized. The bug was introduced in https://github.com/facebook/rocksdb/issues/5498 , resulting in "Corruption: block checksum mismatch" and "heap-buffer-overflow" asan errors in our tests. https://github.com/facebook/rocksdb/issues/5498 was introduced recently and is not in any released versions. ASAN: ``` > ==2280939==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x6250005e83da at pc 0x000000d57f62 bp 0x7f954f483770 sp 0x7f954f482f20 > === How to use this, how to get the raw stack trace, and more: fburl.com/ASAN === > READ of size 4 at 0x6250005e83da thread T4 > SCARINESS: 27 (4-byte-read-heap-buffer-overflow-far-from-bounds) > #0 tests+0xd57f61 __asan_memcpy > https://github.com/facebook/rocksdb/issues/1 rocksdb/src/util/coding.h:124 rocksdb::DecodeFixed32(char const*) > https://github.com/facebook/rocksdb/issues/2 rocksdb/src/table/block_fetcher.cc:39 rocksdb::BlockFetcher::CheckBlockChecksum() > https://github.com/facebook/rocksdb/issues/3 rocksdb/src/table/block_fetcher.cc:99 rocksdb::BlockFetcher::TryGetFromPrefetchBuffer() > https://github.com/facebook/rocksdb/issues/4 rocksdb/src/table/block_fetcher.cc:209 rocksdb::BlockFetcher::ReadBlockContents() > https://github.com/facebook/rocksdb/issues/5 rocksdb/src/table/block_based/block_based_table_reader.cc:93 rocksdb::(anonymous namespace)::ReadBlockFromFile(rocksdb::RandomAccessFileReader*, rocksdb::FilePrefetchBuffer*, rocksdb::Footer const&, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, std::unique_ptr<...>*, rocksdb::ImmutableCFOptions const&, bool, bool, rocksdb::UncompressionDict const&, rocksdb::PersistentCacheOptions const&, unsigned long, unsigned long, rocksdb::MemoryAllocator*, bool) > https://github.com/facebook/rocksdb/issues/6 rocksdb/src/table/block_based/block_based_table_reader.cc:2331 rocksdb::BlockBasedTable::RetrieveBlock(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<...>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, bool) const > https://github.com/facebook/rocksdb/issues/7 rocksdb/src/table/block_based/block_based_table_reader.cc:2090 rocksdb::DataBlockIter* rocksdb::BlockBasedTable::NewDataBlockIterator<...>(rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::DataBlockIter*, rocksdb::BlockType, bool, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::Status, rocksdb::FilePrefetchBuffe r*, bool) const > https://github.com/facebook/rocksdb/issues/8 rocksdb/src/table/block_based/block_based_table_reader.cc:2720 rocksdb::BlockBasedTableIterator<...>::InitDataBlock() > https://github.com/facebook/rocksdb/issues/9 rocksdb/src/table/block_based/block_based_table_reader.cc:2607 rocksdb::BlockBasedTableIterator<...>::SeekToFirst() > https://github.com/facebook/rocksdb/issues/10 
rocksdb/src/table/iterator_wrapper.h:83 rocksdb::IteratorWrapperBase<...>::SeekToFirst() > https://github.com/facebook/rocksdb/issues/11 rocksdb/src/table/merging_iterator.cc:100 rocksdb::MergingIterator::SeekToFirst() > https://github.com/facebook/rocksdb/issues/12 rocksdb/compaction/compaction_job.cc:877 rocksdb::CompactionJob::ProcessKeyValueCompaction(rocksdb::CompactionJob::SubcompactionState*) > https://github.com/facebook/rocksdb/issues/13 rocksdb/compaction/compaction_job.cc:590 rocksdb::CompactionJob::Run() > https://github.com/facebook/rocksdb/issues/14 rocksdb/db_impl/db_impl_compaction_flush.cc:2689 rocksdb::DBImpl::BackgroundCompaction(bool*, rocksdb::JobContext*, rocksdb::LogBuffer*, rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority) > https://github.com/facebook/rocksdb/issues/15 rocksdb/db_impl/db_impl_compaction_flush.cc:2248 rocksdb::DBImpl::BackgroundCallCompaction(rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority) > https://github.com/facebook/rocksdb/issues/16 rocksdb/db_impl/db_impl_compaction_flush.cc:2024 rocksdb::DBImpl::BGWorkCompaction(void*) > https://github.com/facebook/rocksdb/issues/23 rocksdb/src/util/threadpool_imp.cc:266 rocksdb::ThreadPoolImpl::Impl::BGThread(unsigned long) > https://github.com/facebook/rocksdb/issues/24 rocksdb/src/util/threadpool_imp.cc:307 rocksdb::ThreadPoolImpl::Impl::BGThreadWrapper(void*) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5531 Test Plan: Verified that this fixes the fb-internal Logdevice test which caught the issue. Differential Revision: D16109702 Pulled By: sagar0 fbshipit-source-id: 1fc08549cf7b553e338a133ae11eb9f4d5011914 --- util/file_reader_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index f49866d13e7..db16e82ae11 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -842,7 +842,7 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, assert(max_readahead_size_ >= readahead_size_); Status s; if (for_compaction) { - s = Prefetch(file_reader_, offset, readahead_size_, for_compaction); + s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), for_compaction); } else { s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); } From 6edc5d0719d9739e06e860a065f1f873844b836c Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Wed, 3 Jul 2019 18:45:36 -0700 Subject: [PATCH 201/572] Block cache tracing: Associate a unique id with Get and MultiGet (#5514) Summary: This PR associates a unique id with Get and MultiGet. This enables us to track how many blocks a Get/MultiGet request accesses. We can also measure the impact of row cache vs block cache. 
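To make the id assignment concrete before the diff: it boils down to a process-wide atomic counter that never hands out the reserved value 0, which is used when tracing is disabled or the caller is not a Get/MultiGet. A minimal standalone sketch, condensed from the NextGetId() logic in this diff; the class wrapper is illustrative and the tracing-enabled check is omitted:

```
#include <atomic>
#include <cstdint>

// Reserved id for callers that are not a tracked Get/MultiGet.
constexpr uint64_t kReservedGetId = 0;

class GetIdCounter {
 public:
  uint64_t Next() {
    uint64_t prev = counter_.fetch_add(1);
    if (prev == kReservedGetId) {
      // The counter wrapped around to the reserved value; fetch-and-add
      // once more so the reserved id is never handed out.
      prev = counter_.fetch_add(1);
    }
    return prev;
  }

 private:
  std::atomic<uint64_t> counter_{1};  // Start past the reserved id.
};
```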
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5514 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16032681 Pulled By: HaoyuHuang fbshipit-source-id: 775b05f4440badd58de6667e3ec9f4fc87a0af4c --- db/version_set.cc | 16 ++++- table/block_based/block_based_table_reader.cc | 63 +++++++++++++------ table/get_context.cc | 19 +++--- table/get_context.h | 8 ++- table/table_test.cc | 46 +++++++------- trace_replay/block_cache_tracer.cc | 29 ++++++++- trace_replay/block_cache_tracer.h | 38 +++++++---- trace_replay/block_cache_tracer_test.cc | 38 +++++++++++ 8 files changed, 191 insertions(+), 66 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 3354959aac4..226ba0e7e59 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1663,11 +1663,17 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } PinnedIteratorsManager pinned_iters_mgr; + uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_get_id = vset_->block_cache_tracer_->NextGetId(); + } GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, value, value_found, merge_context, max_covering_tombstone_seq, this->env_, - seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); + seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, + tracing_get_id); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1785,7 +1791,12 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, if (merge_operator_) { pinned_iters_mgr.StartPinning(); } + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_mget_id = vset_->block_cache_tracer_->NextGetId(); + } // Even though we know the batch size won't be > MAX_BATCH_SIZE, // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive @@ -1797,7 +1808,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, iter->value, nullptr, &(iter->merge_context), &iter->max_covering_tombstone_seq, this->env_, &iter->seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); + merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + tracing_mget_id); } int get_ctx_index = 0; for (auto iter = range->begin(); iter != range->end(); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 87756f2e240..65bc6dfbc11 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1983,7 +1983,7 @@ CachableEntry BlockBasedTable::GetFilter( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io); + /*no_insert=*/no_io, lookup_context->get_id); block_cache_tracer_->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), /*referenced_key=*/nullptr); @@ -2065,7 +2065,7 @@ CachableEntry BlockBasedTable::GetUncompressionDict( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io); + /*no_insert=*/no_io, lookup_context->get_id); block_cache_tracer_->WriteBlockAccess(access_record, cache_key, rep_->cf_name_for_tracing(), /*referenced_key=*/nullptr); @@ -2426,7 +2426,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - no_insert); + no_insert, lookup_context->get_id); block_cache_tracer_->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), /*referenced_key=*/nullptr); @@ -3340,7 +3340,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet}; + uint64_t tracing_get_id = get_context ? get_context->tracing_get_id() + : BlockCacheTraceHelper::kReservedGetId; + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, + tracing_get_id}; { if (!skip_filters) { filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, @@ -3406,7 +3409,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet}; + TableReaderCaller::kUserGet, tracing_get_id}; bool does_referenced_key_exist = false; DataBlockIter biter; uint64_t referenced_data_size = 0; @@ -3447,8 +3450,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (!get_context->SaveValue( parsed_key, biter.value(), &matched, biter.IsValuePinned() ? &biter : nullptr)) { - does_referenced_key_exist = true; - referenced_data_size = biter.key().size() + biter.value().size(); + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } done = true; break; } @@ -3459,6 +3464,12 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. 
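+      // Trace the exact internal key found in the block when the lookup
+      // succeeded; otherwise fall back to the user key portion of the
+      // search key.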
+ Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = ExtractUserKey(key); + } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, @@ -3467,12 +3478,13 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, rep_->sst_number_for_tracing(), lookup_data_block_context.caller, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); block_cache_tracer_->WriteBlockAccess( access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); + rep_->cf_name_for_tracing(), referenced_key); } if (done) { @@ -3498,14 +3510,19 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet}; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; FilterBlockReader* filter = nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), mget_range->end()); - { - if (!skip_filters) { + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->tracing_get_id(); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, + tracing_mget_id}; + if (!skip_filters) { + { // TODO: Figure out where the stats should go filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, read_options.read_tier == kBlockCacheTier, @@ -3644,7 +3661,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet); + TableReaderCaller::kUserMultiGet, tracing_mget_id); if (first_block) { if (!block_handles[idx_in_batch].IsNull() || !results[idx_in_batch].IsEmpty()) { @@ -3703,7 +3720,6 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, ParsedInternalKey parsed_key; Cleanable dummy; Cleanable* value_pinner = nullptr; - if (!ParseInternalKey(biter->key(), &parsed_key)) { s = Status::Corruption(Slice()); } @@ -3719,11 +3735,13 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, value_pinner = biter; } } - - if (!get_context->SaveValue( - parsed_key, biter->value(), &matched, value_pinner)) { - does_referenced_key_exist = true; - referenced_data_size = biter->key().size() + biter->value().size(); + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } done = true; break; } @@ -3733,6 +3751,12 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { // Avoid making copy of block_key, cf_name, and referenced_key when // constructing the access record. 
+ Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = ExtractUserKey(key); + } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, @@ -3741,12 +3765,13 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, rep_->sst_number_for_tracing(), lookup_data_block_context.caller, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); block_cache_tracer_->WriteBlockAccess( access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), key); + rep_->cf_name_for_tracing(), referenced_key); } s = biter->status(); if (done) { diff --git a/table/get_context.cc b/table/get_context.cc index 9be16b0627d..f0c7928bf42 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -38,15 +38,13 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext(const Comparator* ucmp, - const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, - const Slice& user_key, PinnableSlice* pinnable_val, - bool* value_found, MergeContext* merge_context, - SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index) +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -62,7 +60,8 @@ GetContext::GetContext(const Comparator* ucmp, replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), - is_blob_index_(is_blob_index) { + is_blob_index_(is_blob_index), + tracing_get_id_(tracing_get_id) { if (seq_) { *seq_ = kMaxSequenceNumber; } diff --git a/table/get_context.h b/table/get_context.h index ddce33fb3be..f567229cc9f 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -85,7 +85,8 @@ class GetContext { SequenceNumber* max_covering_tombstone_seq, Env* env, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, - ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + uint64_t tracing_get_id = 0); GetContext() = default; @@ -135,6 +136,8 @@ class GetContext { void ReportCounters(); + uint64_t tracing_get_id() const { return tracing_get_id_; } + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -158,6 +161,9 @@ class GetContext { ReadCallback* callback_; bool sample_; bool* is_blob_index_; + // Used for block cache tracing only. A tracing get id uniquely identifies a + // Get or a MultiGet. + const uint64_t tracing_get_id_; }; // Call this to replay a log and bring the get_context up to date. 
The replay diff --git a/table/table_test.cc b/table/table_test.cc index 418ecf004b7..c3a1f82ed37 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2563,23 +2563,25 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto reader = c.GetTableReader(); PinnableSlice value; - GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - if (index_and_filter_in_cache) { - // data, index and filter block - ASSERT_EQ(get_perf_context()->block_read_count, 3); - ASSERT_EQ(get_perf_context()->index_block_read_count, 1); - ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); - } else { - // just the data block - ASSERT_EQ(get_perf_context()->block_read_count, 1); + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + if (index_and_filter_in_cache) { + // data, index and filter block + ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } else { + // just the data block + ASSERT_EQ(get_perf_context()->block_read_count, 1); + } + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); } - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); // Get non-existing key user_key = "does-not-exist"; @@ -2587,13 +2589,15 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { encoded_key = internal_key.Encode().ToString(); value.Reset(); - get_context = GetContext(options.comparator, nullptr, nullptr, nullptr, + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - ASSERT_EQ(get_context.State(), GetContext::kNotFound); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + } if (index_and_filter_in_cache) { if (bloom_filter_type == 0) { diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index cc875bf0dcd..115a75d924b 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -31,6 +31,7 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) { const uint64_t kMicrosInSecond = 1000 * 1000; const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = "UnknownColumnFamily"; +const uint64_t BlockCacheTraceHelper::kReservedGetId = 0; bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, TableReaderCaller caller) { @@ -39,6 +40,11 @@ bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, caller == TableReaderCaller::kUserMultiGet); } +bool BlockCacheTraceHelper::ShouldTraceGetId(TableReaderCaller caller) { + return caller == TableReaderCaller::kUserGet || + caller == 
TableReaderCaller::kUserMultiGet; +} + BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, std::unique_ptr&& trace_writer) @@ -65,6 +71,9 @@ Status BlockCacheTraceWriter::WriteBlockAccess( trace.payload.push_back(record.caller); trace.payload.push_back(record.is_cache_hit); trace.payload.push_back(record.no_insert); + if (BlockCacheTraceHelper::ShouldTraceGetId(record.caller)) { + PutFixed64(&trace.payload, record.get_id); + } if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type, record.caller)) { PutLengthPrefixedSlice(&trace.payload, referenced_key); @@ -197,7 +206,12 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { } record->no_insert = static_cast(enc_slice[0]); enc_slice.remove_prefix(kCharSize); - + if (BlockCacheTraceHelper::ShouldTraceGetId(record->caller)) { + if (!GetFixed64(&enc_slice, &record->get_id)) { + return Status::Incomplete( + "Incomplete access record: Failed to read the get id."); + } + } if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type, record->caller)) { Slice referenced_key; @@ -236,6 +250,7 @@ Status BlockCacheTracer::StartTrace( if (writer_.load()) { return Status::Busy(); } + get_id_counter_.store(1); trace_options_ = trace_options; writer_.store( new BlockCacheTraceWriter(env, trace_options, std::move(trace_writer))); @@ -266,4 +281,16 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, referenced_key); } +uint64_t BlockCacheTracer::NextGetId() { + if (!writer_.load(std::memory_order_relaxed)) { + return BlockCacheTraceHelper::kReservedGetId; + } + uint64_t prev_value = get_id_counter_.fetch_add(1); + if (prev_value == BlockCacheTraceHelper::kReservedGetId) { + // fetch and add again. + return get_id_counter_.fetch_add(1); + } + return prev_value; +} + } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index e21111727c9..4788a3f447f 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -18,6 +18,16 @@ namespace rocksdb { extern const uint64_t kMicrosInSecond; +class BlockCacheTraceHelper { + public: + static bool ShouldTraceReferencedKey(TraceType block_type, + TableReaderCaller caller); + static bool ShouldTraceGetId(TableReaderCaller caller); + + static const std::string kUnknownColumnFamilyName; + static const uint64_t kReservedGetId; +}; + // Lookup context for tracing block cache accesses. // We trace block accesses at five places: // 1. BlockBasedTable::GetFilter @@ -38,8 +48,10 @@ extern const uint64_t kMicrosInSecond; // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or // kUserApproximateSize). struct BlockCacheLookupContext { -BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} -const TableReaderCaller caller; + BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} + BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id) + : caller(_caller), get_id(_get_id) {} + const TableReaderCaller caller; // These are populated when we perform lookup/insert on block cache. The block // cache tracer uses these inforation when logging the block access at // BlockBasedTable::GET and BlockBasedTable::MultiGet. @@ -49,6 +61,10 @@ const TableReaderCaller caller; uint64_t block_size = 0; std::string block_key; uint64_t num_keys_in_block = 0; + // The unique id associated with Get and MultiGet. 
This enables us to track + // how many blocks a Get/MultiGet request accesses. We can also measure the + // impact of row cache vs block cache. + uint64_t get_id = 0; void FillLookupContext(bool _is_cache_hit, bool _no_insert, TraceType _block_type, uint64_t _block_size, @@ -78,7 +94,8 @@ struct BlockCacheTraceRecord { TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller; Boolean is_cache_hit = Boolean::kFalse; Boolean no_insert = Boolean::kFalse; - + // Required field for Get and MultiGet + uint64_t get_id = BlockCacheTraceHelper::kReservedGetId; // Required fields for data block and user Get/Multi-Get only. std::string referenced_key; uint64_t referenced_data_size = 0; @@ -91,7 +108,7 @@ struct BlockCacheTraceRecord { TraceType _block_type, uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller, - bool _is_cache_hit, bool _no_insert, + bool _is_cache_hit, bool _no_insert, uint64_t _get_id, std::string _referenced_key = "", uint64_t _referenced_data_size = 0, uint64_t _num_keys_in_block = 0, @@ -107,6 +124,7 @@ struct BlockCacheTraceRecord { caller(_caller), is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), + get_id(_get_id), referenced_key(_referenced_key), referenced_data_size(_referenced_data_size), num_keys_in_block(_num_keys_in_block), @@ -121,14 +139,6 @@ struct BlockCacheTraceHeader { uint32_t rocksdb_minor_version; }; -class BlockCacheTraceHelper { - public: - static bool ShouldTraceReferencedKey(TraceType block_type, - TableReaderCaller caller); - - static const std::string kUnknownColumnFamilyName; -}; - // BlockCacheTraceWriter captures all RocksDB block cache accesses using a // user-provided TraceWriter. Every RocksDB operation is written as a single // trace. Each trace will have a timestamp and type, followed by the trace @@ -207,11 +217,15 @@ class BlockCacheTracer { const Slice& block_key, const Slice& cf_name, const Slice& referenced_key); + // GetId cycles from 1 to port::kMaxUint64. + uint64_t NextGetId(); + private: TraceOptions trace_options_; // A mutex protects the writer_. InstrumentedMutex trace_writer_mutex_; std::atomic writer_; + std::atomic get_id_counter_; }; } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index e7a5881044f..aae513ad5d7 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -71,6 +71,9 @@ class BlockCacheTracerTest : public testing::Test { record.sst_fd_number = kSSTFDNumber + key_id; record.is_cache_hit = Boolean::kFalse; record.no_insert = Boolean::kFalse; + // Provide get_id for all callers. The writer should only write get_id + // when the caller is either GET or MGET. + record.get_id = key_id + 1; // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. 
@@ -120,6 +123,12 @@ class BlockCacheTracerTest : public testing::Test { ASSERT_EQ(kSSTFDNumber + key_id, record.sst_fd_number); ASSERT_EQ(Boolean::kFalse, record.is_cache_hit); ASSERT_EQ(Boolean::kFalse, record.no_insert); + if (record.caller == TableReaderCaller::kUserGet || + record.caller == TableReaderCaller::kUserMultiGet) { + ASSERT_EQ(key_id + 1, record.get_id); + } else { + ASSERT_EQ(BlockCacheTraceHelper::kReservedGetId, record.get_id); + } if (block_type == TraceType::kBlockTraceDataBlock && (record.caller == TableReaderCaller::kUserGet || record.caller == TableReaderCaller::kUserMultiGet)) { @@ -239,6 +248,35 @@ TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { } } +TEST_F(BlockCacheTracerTest, NextGetId) { + BlockCacheTracer writer; + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + // next get id should always return 0 before we call StartTrace. + ASSERT_EQ(0, writer.NextGetId()); + ASSERT_EQ(0, writer.NextGetId()); + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_EQ(1, writer.NextGetId()); + ASSERT_EQ(2, writer.NextGetId()); + writer.EndTrace(); + // next get id should return 0. + ASSERT_EQ(0, writer.NextGetId()); + } + + // Start trace again and next get id should return 1. + { + TraceOptions trace_opt; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); + ASSERT_EQ(1, writer.NextGetId()); + } +} + TEST_F(BlockCacheTracerTest, MixedBlocks) { { // Generate a trace file containing a mix of blocks. From e4dcf5fd22509ae6741733a0f02feb7b68421f55 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 3 Jul 2019 19:03:29 -0700 Subject: [PATCH 202/572] db_bench to add a new "benchmark" to print out all stats history (#5532) Summary: Sometimes it is helpful to fetch the whole history of stats after benchmark runs. Add such an option Pull Request resolved: https://github.com/facebook/rocksdb/pull/5532 Test Plan: Run the benchmark manually and observe the output is as expected. Differential Revision: D16097764 fbshipit-source-id: 10b5b735a22a18be198b8f348be11f11f8806904 --- HISTORY.md | 1 + tools/db_bench_tool.cc | 36 ++++++++++++++++++++++++++++++++++++ tools/ldb_cmd.cc | 32 +++++++++++--------------------- util/string_util.cc | 11 +++++++++++ util/string_util.h | 4 ++++ 5 files changed, 63 insertions(+), 21 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c3af6ba06d7..6e0fcc54efb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Add C bindings for secondary instance, i.e. DBImplSecondary. +* db_bench adds a "benchmark" stats_history, which prints out the whole stats history. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. 
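For reference, one way to drive the new pseudo-benchmark from the command line; `stats_history` as a `-benchmarks` entry comes from this patch, while the stats-persistence flag spelling is assumed from contemporary db_bench options and may differ by version:

```
$ ./db_bench -benchmarks=fillrandom,stats_history \
    -stats_persist_period_sec=1 -duration=10
```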
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index abffae5d9e8..39f9eebc7e0 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -49,6 +49,7 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/stats_history.h" #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/options_util.h" @@ -2867,6 +2868,8 @@ class Benchmark { PrintStats("rocksdb.levelstats"); } else if (name == "sstables") { PrintStats("rocksdb.sstables"); + } else if (name == "stats_history") { + PrintStatsHistory(); } else if (name == "replay") { if (num_threads > 1) { fprintf(stderr, "Multi-threaded replay is not yet supported\n"); @@ -6259,6 +6262,39 @@ class Benchmark { } } + void PrintStatsHistory() { + if (db_.db != nullptr) { + PrintStatsHistoryImpl(db_.db, false); + } + for (const auto& db_with_cfh : multi_dbs_) { + PrintStatsHistoryImpl(db_with_cfh.db, true); + } + } + + void PrintStatsHistoryImpl(DB* db, bool print_header) { + if (print_header) { + fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str()); + } + + std::unique_ptr shi; + Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi); + if (!s.ok()) { + fprintf(stdout, "%s\n", s.ToString().c_str()); + return; + } + assert(shi); + while (shi->Valid()) { + uint64_t stats_time = shi->GetStatsTime(); + fprintf(stdout, "------ %s ------\n", + TimeToHumanString(static_cast(stats_time)).c_str()); + for (auto& entry : shi->GetStatsMap()) { + fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time, + entry.first.c_str(), entry.second); + } + shi->Next(); + } + } + void PrintStats(const char* key) { if (db_.db != nullptr) { PrintStats(db_.db, key, false); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index a1507b188b2..fba32d9d622 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -859,8 +859,7 @@ void CompactorCommand::DoCommand() { delete end; } -// ---------------------------------------------------------------------------- - +// --------------------------------------------------------------------------- const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; const std::string DBLoaderCommand::ARG_COMPACT = "compact"; @@ -1168,19 +1167,8 @@ void DropColumnFamilyCommand::DoCommand() { } // ---------------------------------------------------------------------------- - namespace { -std::string ReadableTime(int unixtime) { - char time_buffer [80]; - time_t rawtime = unixtime; - struct tm tInfo; - struct tm* timeinfo = localtime_r(&rawtime, &tInfo); - assert(timeinfo == &tInfo); - strftime(time_buffer, 80, "%c", timeinfo); - return std::string(time_buffer); -} - // This function only called when it's the sane case of >1 buckets in time-range // Also called only when timekv falls between ttl_start and ttl_end provided void IncBucketCounts(std::vector& bucket_counts, int ttl_start, @@ -1202,13 +1190,13 @@ void PrintBucketCounts(const std::vector& bucket_counts, int time_point = ttl_start; for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { fprintf(stdout, "Keys in range %s to %s : %lu\n", - ReadableTime(time_point).c_str(), - ReadableTime(time_point + bucket_size).c_str(), + TimeToHumanString(time_point).c_str(), + TimeToHumanString(time_point + bucket_size).c_str(), (unsigned long)bucket_counts[i]); } fprintf(stdout, "Keys in range %s to %s : %lu\n", - ReadableTime(time_point).c_str(), - 
ReadableTime(ttl_end).c_str(), + TimeToHumanString(time_point).c_str(), + TimeToHumanString(ttl_end).c_str(), (unsigned long)bucket_counts[num_buckets - 1]); } @@ -1564,7 +1552,8 @@ void DBDumperCommand::DoDumpCommand() { std::vector bucket_counts(num_buckets, 0); if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { fprintf(stdout, "Dumping key-values from %s to %s\n", - ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + TimeToHumanString(ttl_start).c_str(), + TimeToHumanString(ttl_end).c_str()); } HistogramImpl vsize_hist; @@ -1619,7 +1608,7 @@ void DBDumperCommand::DoDumpCommand() { if (!count_only_ && !count_delim_) { if (is_db_ttl_ && timestamp_) { - fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str()); } std::string str = PrintKeyValue(iter->key().ToString(), iter->value().ToString(), @@ -2397,7 +2386,8 @@ void ScanCommand::DoCommand() { } if (is_db_ttl_ && timestamp_) { fprintf(stdout, "Scanning key-values from %s to %s\n", - ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); + TimeToHumanString(ttl_start).c_str(), + TimeToHumanString(ttl_end).c_str()); } for ( ; it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); @@ -2409,7 +2399,7 @@ void ScanCommand::DoCommand() { continue; } if (timestamp_) { - fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); + fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str()); } } diff --git a/util/string_util.cc b/util/string_util.cc index 74f6afbf0f4..9b447d50ce3 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -18,6 +18,7 @@ #include #include "rocksdb/env.h" #include "port/port.h" +#include "port/sys_time.h" #include "rocksdb/slice.h" namespace rocksdb { @@ -139,6 +140,16 @@ std::string BytesToHumanString(uint64_t bytes) { return std::string(buf); } +std::string TimeToHumanString(int unixtime) { + char time_buffer[80]; + time_t rawtime = unixtime; + struct tm tInfo; + struct tm* timeinfo = localtime_r(&rawtime, &tInfo); + assert(timeinfo == &tInfo); + strftime(time_buffer, 80, "%c", timeinfo); + return std::string(time_buffer); +} + std::string EscapeString(const Slice& value) { std::string r; AppendEscapedStringTo(&r, value); diff --git a/util/string_util.h b/util/string_util.h index 6e125ddfa8f..faf763e54a1 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -50,6 +50,10 @@ extern std::string NumberToHumanString(int64_t num); // ex: 1048576 -> 1.00 GB extern std::string BytesToHumanString(uint64_t bytes); +// Return a human-readable version of unix time +// ex: 1562116015 -> "Tue Jul 2 18:06:55 2019" extern std::string TimeToHumanString(int unixtime); + // Append a human-readable time in micros. int AppendHumanMicros(uint64_t micros, char* output, int len, bool fixed_format); From 4f66ec977d9b8b83c0b7e16d25a43281cd6a8073 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Thu, 4 Jul 2019 17:24:33 -0700 Subject: [PATCH 203/572] Fix lower bound check error when iterate across file boundary (#5540) Summary: Since https://github.com/facebook/rocksdb/issues/5468, `LevelIterator` compares the lower bound against the file's smallest key in `NewFileIterator` and caches the result to reduce per-key lower bound checks. However, when iterating across a file boundary, it doesn't update the cached result: `Valid()` is false at that point because it still reflects the status of the previous file iterator. Fix it by removing the `Valid()` check from `CheckMayBeOutOfLowerBound()`.
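Condensed, the corrected check looks as follows. This is a fragment of `LevelIterator`, not a standalone function; the comparison tail, which is cut off in the hunk as rendered below, is reconstructed here:

```
void CheckMayBeOutOfLowerBound() {
  // Do not gate this on Valid(): when stepping across a file boundary,
  // Valid() still reflects the previous file's iterator, so the cached
  // flag would never be refreshed for the new file.
  if (read_options_.iterate_lower_bound != nullptr &&
      file_index_ < flevel_->num_files) {
    may_be_out_of_lower_bound_ =
        user_comparator_.Compare(
            ExtractUserKey(file_smallest_key(file_index_)),
            *read_options_.iterate_lower_bound) < 0;
  }
}
```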
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5540 Test Plan: See the new test. Signed-off-by: Yi Wu Differential Revision: D16127653 fbshipit-source-id: a0691e1164658d485c17971aaa97028812f74678 --- db/db_iterator_test.cc | 26 ++++++++++++++++++++++++++ db/version_set.cc | 3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 67a97b20b81..997b38602c4 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2821,6 +2821,32 @@ TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) { delete iter; } +TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) { + ASSERT_OK(Put("aaa", "v")); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + // Move both files to bottom level. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Slice lower_bound("b"); + ReadOptions read_opts; + read_opts.iterate_lower_bound = &lower_bound; + std::unique_ptr iter(NewIterator(read_opts)); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/version_set.cc b/db/version_set.cc index 226ba0e7e59..32dd61db830 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -984,7 +984,8 @@ class LevelIterator final : public InternalIterator { // Note MyRocks may update iterate bounds between seek. To workaround it, // we need to check and update may_be_out_of_lower_bound_ accordingly. void CheckMayBeOutOfLowerBound() { - if (Valid() && read_options_.iterate_lower_bound != nullptr) { + if (read_options_.iterate_lower_bound != nullptr && + file_index_ < flevel_->num_files) { may_be_out_of_lower_bound_ = user_comparator_.Compare( ExtractUserKey(file_smallest_key(file_index_)), From 2de61d91293eb2ec2185d2bbe2b2eebc55db94cc Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 5 Jul 2019 12:28:48 -0700 Subject: [PATCH 204/572] Assert get_context not null in BlockBasedTable::Get() (#5542) Summary: clang analyze fails after https://github.com/facebook/rocksdb/pull/5514 for this failure: table/block_based/block_based_table_reader.cc:3450:16: warning: Called C++ object pointer is null if (!get_context->SaveValue( ^~~~~~~~~~~~~~~~~~~~~~~ 1 warning generated. The reason is that a branch on whether get_context is null was added earlier in the function, so clang analyze thinks it can be null when the call is made without a null check. Fix the issue by removing the branch and adding an assert. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5542 Test Plan: "make all check" passes and the clang analyze failure goes away.
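The pattern at work, reduced to a toy example (names here are hypothetical, not the RocksDB API): an assert both documents the non-null invariant and lets the analyzer discharge the warning, whereas a conditional read of the pointer teaches the analyzer that null is a possible value.

```
#include <cassert>

struct GetContext {
  void SaveValue() {}
};

void Lookup(GetContext* get_context) {
  // Invariant: callers always pass a non-null context. With the assert
  // (and no earlier null-check branch), clang analyze no longer assumes
  // get_context may be null at the call below.
  assert(get_context != nullptr);
  get_context->SaveValue();
}
```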
Differential Revision: D16133988 fbshipit-source-id: d4627d03c4746254cc11926c523931086ccebcda --- table/block_based/block_based_table_reader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 65bc6dfbc11..baa5c397eb7 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -3335,13 +3335,13 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, const SliceTransform* prefix_extractor, bool skip_filters) { assert(key.size() >= 8); // key must be internal key + assert(get_context != nullptr); Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - uint64_t tracing_get_id = get_context ? get_context->tracing_get_id() - : BlockCacheTraceHelper::kReservedGetId; + uint64_t tracing_get_id = get_context->tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; { From e0d9d57750cb348f376ddd022276e8f493dd1e17 Mon Sep 17 00:00:00 2001 From: anand76 Date: Sat, 6 Jul 2019 21:04:22 -0700 Subject: [PATCH 205/572] Fix bugs in WAL trash file handling (#5520) Summary: 1. Cleanup WAL trash files on open 2. Don't apply deletion rate limit if WAL dir is different from db dir Pull Request resolved: https://github.com/facebook/rocksdb/pull/5520 Test Plan: Add new unit tests and make check Differential Revision: D16096750 Pulled By: anand1976 fbshipit-source-id: 6f07858ad864b754b711db416f0389c45ede599b --- HISTORY.md | 2 + db/db_impl/db_impl.cc | 14 ++-- db/db_impl/db_impl.h | 2 + db/db_impl/db_impl_files.cc | 3 +- db/db_impl/db_impl_open.cc | 23 +++++++ db/db_sst_test.cc | 105 ++++++++++++++++++++++++++++++ db/wal_manager.cc | 9 ++- db/wal_manager.h | 6 +- file/file_util.cc | 17 ++++- file/file_util.h | 6 +- utilities/blob_db/blob_db_impl.cc | 6 +- 11 files changed, 177 insertions(+), 16 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6e0fcc54efb..c425c578f87 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,6 +14,7 @@ * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Add C bindings for secondary instance, i.e. DBImplSecondary. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. +* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. @@ -40,6 +41,7 @@ * Fix ingested file and directory not being fsync. * Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. 
+* On DB open, delete WAL trash files left behind in wal_dir ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 55f89eab32e..cf8dddb7fe1 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3137,6 +3137,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; + bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal @@ -3159,7 +3160,9 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); } else if (type == kTableFile || type == kLogFile) { - del = DeleteDBFile(&soptions, path_to_delete, dbname); + del = + DeleteDBFile(&soptions, path_to_delete, dbname, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); } else { del = env->DeleteFile(path_to_delete); } @@ -3193,7 +3196,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(fname, &number, &type) && type == kTableFile) { // Lock file will be deleted at end std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname); + Status del = DeleteDBFile(&soptions, table_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); if (result.ok() && !del.ok()) { result = del; } @@ -3220,7 +3224,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = - DeleteDBFile(&soptions, archivedir + "/" + file, archivedir); + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } @@ -3235,7 +3240,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), - soptions.wal_dir); + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 737f2337608..d417035b1ef 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1893,6 +1893,8 @@ class DBImpl : public DB { // results sequentially. Flush results of memtables with lower IDs get // installed to MANIFEST first. 
InstrumentedCondVar atomic_flush_install_cv_; + + bool wal_in_db_path_; }; extern Options SanitizeOptions(const std::string& db, const Options& src); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index c018432c9b8..7afe3955e5b 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -258,7 +258,8 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, Status file_deletion_status; if (type == kTableFile || type == kLogFile) { file_deletion_status = - DeleteDBFile(&immutable_db_options_, fname, path_to_sync); + DeleteDBFile(&immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); } else { file_deletion_status = env_->DeleteFile(fname); } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 13d6959d474..82e61a260b8 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -122,6 +122,25 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } #ifndef ROCKSDB_LITE + ImmutableDBOptions immutable_db_options(result); + if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + // Either the WAL dir and db_paths[0]/db_name are not the same, or we + // cannot tell for sure. In either case, assume they're different and + // explicitly cleanup the trash log files (bypass DeleteScheduler) + // Do this first so even if we end up calling + // DeleteScheduler::CleanupDirectory on the same dir later, it will be + // safe + std::vector filenames; + result.env->GetChildren(result.wal_dir, &filenames); + for (std::string& filename : filenames) { + if (filename.find(".log.trash", + filename.length() - std::string(".log.trash").length()) != + std::string::npos) { + std::string trash_file = result.wal_dir + "/" + filename; + result.env->DeleteFile(trash_file); + } + } + } // When the DB is stopped, it's possible that there are some .trash files that // were not deleted yet, when we open the DB we will find these .trash files // and schedule them to be deleted (or delete immediately if SstFileManager @@ -1294,6 +1313,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, delete impl; return s; } + + impl->wal_in_db_path_ = + IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists s = impl->Recover(column_families); diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 799d0e14f6b..37adee46722 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -470,6 +470,111 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +class DBWALTestWithParam + : public DBSSTTest, + public testing::WithParamInterface> { + public: + DBWALTestWithParam() { + wal_dir_ = std::get<0>(GetParam()); + wal_dir_same_as_dbname_ = std::get<1>(GetParam()); + } + + std::string wal_dir_; + bool wal_dir_same_as_dbname_; +}; + +TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { + class MyEnv : public EnvWrapper { + public: + MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} + + Status DeleteFile(const std::string& fname) { + if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { + return Status::OK(); + } + + return target()->DeleteFile(fname); + } + + void set_fake_log_delete(bool fake) { fake_log_delete = fake; } + + private: + bool fake_log_delete; + }; + + std::unique_ptr env(new MyEnv(Env::Default())); + Destroy(last_options_); + + env->set_fake_log_delete(true); + + Options options = CurrentOptions(); + 
options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env.get(); + options.wal_dir = dbname_ + wal_dir_; + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + + ASSERT_OK(TryReopen(options)); + + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + Close(); + + options.sst_file_manager.reset(); + std::vector filenames; + int trash_log_count = 0; + if (!wal_dir_same_as_dbname_) { + // Forcibly create some trash log files + std::unique_ptr result; + env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions()); + result.reset(); + } + env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_GE(trash_log_count, 1); + + env->set_fake_log_delete(false); + ASSERT_OK(TryReopen(options)); + + filenames.clear(); + trash_log_count = 0; + env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_EQ(trash_log_count, 0); + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam, + ::testing::Values(std::make_tuple("", true), + std::make_tuple("_wal_dir", false))); + TEST_F(DBSSTTest, OpenDBWithExistingTrash) { Options options = CurrentOptions(); diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 58671d599c5..0c996baf549 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -187,7 +187,8 @@ void WalManager::PurgeObsoleteWALFiles() { continue; } if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -213,7 +214,8 @@ void WalManager::PurgeObsoleteWALFiles() { log_file_size = std::max(log_file_size, file_size); ++log_files_num; } else { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), @@ -253,7 +255,8 @@ void WalManager::PurgeObsoleteWALFiles() { for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, - db_options_.wal_dir, false); + db_options_.wal_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); diff --git a/db/wal_manager.h b/db/wal_manager.h index 
9d5afb25d5e..8d185c35076 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -18,6 +18,7 @@ #include #include "db/version_set.h" +#include "file/file_util.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" @@ -40,7 +41,8 @@ class WalManager { env_options_(env_options), env_(db_options.env), purge_wal_files_last_run_(0), - seq_per_batch_(seq_per_batch) {} + seq_per_batch_(seq_per_batch), + wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {} Status GetSortedWalFiles(VectorLogPtr& files); @@ -97,6 +99,8 @@ class WalManager { bool seq_per_batch_; + bool wal_in_db_path_; + // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; diff --git a/file/file_util.cc b/file/file_util.cc index 0364f834022..050d25da1a7 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -88,12 +88,12 @@ Status CreateFile(Env* env, const std::string& destination, } Status DeleteDBFile(const ImmutableDBOptions* db_options, - const std::string& fname, const std::string& dir_to_sync, - const bool force_bg) { + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg, const bool force_fg) { #ifndef ROCKSDB_LITE SstFileManagerImpl* sfm = static_cast(db_options->sst_file_manager.get()); - if (sfm) { + if (sfm && !force_fg) { return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); } else { return db_options->env->DeleteFile(fname); @@ -101,10 +101,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, #else (void)dir_to_sync; (void)force_bg; + (void)force_fg; // SstFileManager is not supported in ROCKSDB_LITE // Delete file immediately return db_options->env->DeleteFile(fname); #endif } +bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { + bool same = false; + Status s = db_options->env->AreFilesSame(db_options->wal_dir, + db_options->db_paths[0].path, &same); + if (s.IsNotSupported()) { + same = db_options->wal_dir == db_options->db_paths[0].path; + } + return same; +} + } // namespace rocksdb diff --git a/file/file_util.h b/file/file_util.h index 9116c1fecfb..75d6d7eb9fe 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -24,7 +24,9 @@ extern Status CreateFile(Env* env, const std::string& destination, extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, - const std::string& path_to_sync, - const bool force_bg = false); + const std::string& path_to_sync, const bool force_bg, + const bool force_fg); + +extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); } // namespace rocksdb diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 86eb1460c15..caa9b098804 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1758,7 +1758,8 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { blob_files_.erase(bfile->BlobFileNumber()); Status s = DeleteDBFile(&(db_impl_->immutable_db_options()), - bfile->PathName(), blob_dir_, true); + bfile->PathName(), blob_dir_, true, + /*force_fg=*/false); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File failed to be deleted as obsolete %s", @@ -1848,7 +1849,8 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, uint64_t number; FileType type; if (ParseFileName(f, &number, &type) && type == kBlobFile) { - Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true); + Status del = DeleteDBFile(&soptions, 
blobdir + "/" + f, blobdir, true, + /*force_fg=*/false); if (status.ok() && !del.ok()) { status = del; } From 8d34806972ad8867ede364feaa9d403e79b87d35 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Sun, 7 Jul 2019 21:29:39 -0700 Subject: [PATCH 206/572] setup wal_in_db_path_ for secondary instance (#5545) Summary: PR https://github.com/facebook/rocksdb/pull/5520 adds DBImpl:: wal_in_db_path_ and initializes it in DBImpl::Open, this PR fixes the valgrind error for secondary instance: ``` ==236417== Conditional jump or move depends on uninitialised value(s) ==236417== at 0x62242A: rocksdb::DeleteDBFile(rocksdb::ImmutableDBOptions const*, std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, bool, bool) (file_util.cc:96) ==236417== by 0x512432: rocksdb::DBImpl::DeleteObsoleteFileImpl(int, std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, rocksdb::FileType, unsigned long) (db_impl_files.cc:261) ==236417== by 0x515A7A: rocksdb::DBImpl::PurgeObsoleteFiles(rocksdb::JobContext&, bool) (db_impl_files.cc:492) ==236417== by 0x499153: rocksdb::ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() (column_family.cc:75) ==236417== by 0x499880: rocksdb::ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() (column_family.cc:84) ==236417== by 0x4C9AF9: rocksdb::DB::DestroyColumnFamilyHandle(rocksdb::ColumnFamilyHandle*) (db_impl.cc:3105) ==236417== by 0x44E853: CloseSecondary (db_secondary_test.cc:53) ==236417== by 0x44E853: rocksdb::DBSecondaryTest::~DBSecondaryTest() (db_secondary_test.cc:31) ==236417== by 0x44EC77: ~DBSecondaryTest_PrimaryDropColumnFamily_Test (db_secondary_test.cc:443) ==236417== by 0x44EC77: rocksdb::DBSecondaryTest_PrimaryDropColumnFamily_Test::~DBSecondaryTest_PrimaryDropColumnFamily_Test() (db_secondary_test.cc:443) ==236417== by 0x83D1D7: HandleSehExceptionsInMethodIfSupported (gtest-all.cc:3824) ==236417== by 0x83D1D7: void testing::internal::HandleExceptionsInMethodIfSupported(testing::Test*, void (testing::Test::*)(), char const*) (gtest-all.cc:3860) ==236417== by 0x8346DB: testing::TestInfo::Run() [clone .part.486] (gtest-all.cc:4078) ==236417== by 0x8348D4: Run (gtest-all.cc:4047) ==236417== by 0x8348D4: testing::TestCase::Run() [clone .part.487] (gtest-all.cc:4190) ==236417== by 0x834D14: Run (gtest-all.cc:6100) ==236417== by 0x834D14: testing::internal::UnitTestImpl::RunAllTests() (gtest-all.cc:6062) ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5545 Differential Revision: D16146224 Pulled By: miasantreble fbshipit-source-id: 184c90e451352951da4e955f054d4b1a1f29ea29 --- db/db_impl/db_impl_secondary.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 8b93f675f8c..e14e53e55c3 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -587,6 +587,9 @@ Status DB::OpenAsSecondary( &impl->write_controller_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = + IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); s = impl->Recover(column_families, true, false, false); if (s.ok()) { From 7c76a7fba271ed9023d9d7ed714ae2b519087fdf Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Sun, 7 Jul 2019 22:40:52 -0700 Subject: [PATCH 207/572] Support GetAllKeyVersions() for non-default cf (#5544) Summary: Previously `GetAllKeyVersions()` supports 
default column family only. This PR adds support for other column families. Test plan (devserver): ``` $make clean && COMPILE_WITH_ASAN=1 make -j32 db_basic_test $./db_basic_test --gtest_filter=DBBasicTest.GetAllKeyVersions ``` All other unit tests must pass. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5544 Differential Revision: D16147551 Pulled By: riversand963 fbshipit-source-id: 5a61aece2a32d789e150226a9b8d53f4a5760168 --- HISTORY.md | 1 + db/db_basic_test.cc | 50 +++++++++++++++++++++++++++++++ include/rocksdb/utilities/debug.h | 4 +++ utilities/debug.cc | 26 +++++++++++++++++++--- 4 files changed, 77 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c425c578f87..d7eb51160ee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,7 @@ * Add C bindings for secondary instance, i.e. DBImplSecondary. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path +* Overload GetAllKeyVersions() to support non-default column family. ### New Features * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 66d3b3aff7c..dc77fb91a9b 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" #include "table/block_based/block_builder.h" #include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) @@ -1286,6 +1287,55 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, GetAllKeyVersions) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + const size_t kNumInserts = 4; + const size_t kNumDeletes = 4; + const size_t kNumUpdates = 4; + + // Check default column family + for (size_t i = 0; i != kNumInserts; ++i) { + ASSERT_OK(Put(std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates; ++i) { + ASSERT_OK(Put(std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes; ++i) { + ASSERT_OK(Delete(std::to_string(i))); + } + std::vector key_versions; + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[0], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + + // Check non-default column family + for (size_t i = 0; i != kNumInserts - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes - 1; ++i) { + ASSERT_OK(Delete(1, std::to_string(i))); + } + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[1], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); +} +#endif // !ROCKSDB_LITE + class
DBBasicTestWithParallelIO : public DBTestBase, public testing::WithParamInterface> { diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index 50645423d0a..3fc414b6edf 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -40,6 +40,10 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, size_t max_num_ikeys, std::vector* key_versions); +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions); + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/debug.cc b/utilities/debug.cc index 8ddf64b5dc4..3c35f4c1122 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -14,16 +14,34 @@ namespace rocksdb { Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, size_t max_num_ikeys, std::vector* key_versions) { - assert(key_versions != nullptr); + if (nullptr == db) { + return Status::InvalidArgument("db cannot be null."); + } + return GetAllKeyVersions(db, db->DefaultColumnFamily(), begin_key, end_key, + max_num_ikeys, key_versions); +} + +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions) { + if (nullptr == db) { + return Status::InvalidArgument("db cannot be null."); + } + if (nullptr == cfh) { + return Status::InvalidArgument("Column family handle cannot be null."); + } + if (nullptr == key_versions) { + return Status::InvalidArgument("key_versions cannot be null."); + } key_versions->clear(); DBImpl* idb = static_cast(db->GetRootDB()); - auto icmp = InternalKeyComparator(idb->GetOptions().comparator); + auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); Arena arena; - ScopedArenaIterator iter( - idb->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber)); + ScopedArenaIterator iter(idb->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber, cfh)); if (!begin_key.empty()) { InternalKey ikey; From 6ca3feed5c5e3cb71a26f3aa58fdb46d64020c35 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 8 Jul 2019 00:09:44 -0700 Subject: [PATCH 208/572] Fix -Werror=shadow (#5546) Summary: This PR fixes shadow errors. 
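For illustration, here is a minimal self-contained sketch (hypothetical code, not taken from this diff) of the pattern that `-Werror=shadow` rejects, fixed the same way this patch fixes it, by renaming the local (compare `total_accesses` vs. `accesses` below):

```
#include <cstdint>

class Counter {
 public:
  uint64_t total_accesses() const { return hits_ + misses_; }

  double miss_ratio() const {
    // With -Werror=shadow the following declaration is a compile error,
    // because the local 'total_accesses' shadows the member function:
    //   uint64_t total_accesses = hits_ + misses_;
    // Renaming the local resolves the conflict.
    uint64_t accesses = hits_ + misses_;
    return accesses == 0 ? 0.0 : misses_ * 100.0 / accesses;
  }

 private:
  uint64_t hits_ = 0;
  uint64_t misses_ = 0;
};
```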
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5546 Test Plan: make clean && make check -j32 && make clean && USE_CLANG=1 make check -j32 && make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16147841 Pulled By: HaoyuHuang fbshipit-source-id: 1043500d70c134185f537ab4c3900452752f1534 --- table/block_based/block_based_table_reader.cc | 4 ++-- table/get_context.h | 2 +- utilities/simulator_cache/cache_simulator.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index baa5c397eb7..26c1365c4e7 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -3341,7 +3341,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, CachableEntry filter_entry; bool may_match; FilterBlockReader* filter = nullptr; - uint64_t tracing_get_id = get_context->tracing_get_id(); + uint64_t tracing_get_id = get_context->get_tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; { @@ -3517,7 +3517,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, mget_range->end()); uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { - tracing_mget_id = sst_file_range.begin()->get_context->tracing_get_id(); + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, tracing_mget_id}; diff --git a/table/get_context.h b/table/get_context.h index f567229cc9f..7a37beb2df2 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -136,7 +136,7 @@ class GetContext { void ReportCounters(); - uint64_t tracing_get_id() const { return tracing_get_id_; } + uint64_t get_tracing_get_id() const { return tracing_get_id_; } private: const Comparator* ucmp_; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 145efdb6cba..65f626036b0 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -34,8 +34,8 @@ void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { double CacheSimulator::miss_ratio() { uint64_t hits = sim_cache_->get_hit_counter(); uint64_t misses = sim_cache_->get_miss_counter(); - uint64_t total_accesses = hits + misses; - return static_cast(misses * 100.0 / total_accesses); + uint64_t accesses = hits + misses; + return static_cast(misses * 100.0 / accesses); } uint64_t CacheSimulator::total_accesses() { From 872a261ffc2a440dfe9e60d99e421e42f5f2cf5e Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 8 Jul 2019 13:28:08 -0700 Subject: [PATCH 209/572] db_stress to print some internal keys after verification failure (#5543) Summary: Print out some more information when db_stress fails with verification failures to help debug problems.
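As a rough sketch of the new diagnostic (assuming an already-open `DB* db`; this is illustrative code, not part of the diff), the added logic boils down to dumping the internal key versions in the mismatched range via `GetAllKeyVersions()` from `rocksdb/utilities/debug.h`:

```
#include <cinttypes>
#include <cstdio>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/utilities/debug.h"

// Prints up to eight internal key versions in [begin_key, end_key] of the
// default column family.
void DumpInternalKeys(rocksdb::DB* db, const rocksdb::Slice& begin_key,
                      const rocksdb::Slice& end_key) {
  const size_t kMaxNumIKeys = 8;
  std::vector<rocksdb::KeyVersion> versions;
  rocksdb::Status s = rocksdb::GetAllKeyVersions(db, begin_key, end_key,
                                                 kMaxNumIKeys, &versions);
  if (!s.ok()) {
    fprintf(stderr, "%s\n", s.ToString().c_str());
    return;
  }
  for (const rocksdb::KeyVersion& kv : versions) {
    // Each entry carries the user key plus its sequence number and value type.
    fprintf(stderr, "  key %s seq %" PRIu64 " type %d\n",
            rocksdb::Slice(kv.user_key).ToString(true /* hex */).c_str(),
            kv.sequence, kv.type);
  }
}
```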
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5543 Test Plan: Manually ingest some failures and observe the outputs are like this: Verification failed [default] 0000000000199A5A => 7C3D000078797A7B74757677707172736C6D6E6F68696A6B [6] 000000000019C8BD => 65380000616063626D6C6F6E69686B6A internal keys in default CF [0000000000199A5A, 000000000019C8BD] (max 8) key 0000000000199A5A seq 179246 type 1 key 000000000019C8BD seq 163970 type 1 Latest Sequence Number: 292234 Differential Revision: D16153717 fbshipit-source-id: b33fa50a828c190cbf8249a37955432044f92daf --- tools/db_stress.cc | 39 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 813f8068278..66a10d4f3a2 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -55,6 +55,7 @@ int main() { #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/debug.h" #include "rocksdb/utilities/options_util.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" @@ -4261,16 +4262,46 @@ class AtomicFlushStressTest : public StressTest { key = iters[i]->key(); value = iters[i]->value(); } else { - if (key.compare(iters[i]->key()) != 0) { + int cmp = key.compare(iters[i]->key()); + if (cmp != 0) { fprintf(stderr, "Verification failed\n"); - fprintf(stderr, "cf%s: %s => %s\n", + fprintf(stderr, "[%s] %s => %s\n", column_families_[0]->GetName().c_str(), key.ToString(true /* hex */).c_str(), - value.ToString(/* hex */).c_str()); - fprintf(stderr, "cf%s: %s => %s\n", + value.ToString(true /* hex */).c_str()); + fprintf(stderr, "[%s] %s => %s\n", column_families_[i]->GetName().c_str(), iters[i]->key().ToString(true /* hex */).c_str(), iters[i]->value().ToString(true /* hex */).c_str()); +#ifndef ROCKSDB_LITE + Slice begin_key; + Slice end_key; + if (cmp < 0) { + begin_key = key; + end_key = iters[i]->key(); + } else { + begin_key = iters[i]->key(); + end_key = key; + } + // We should print both of CF 0 and i but GetAllKeyVersions() now + // only supports default CF. + std::vector versions; + const size_t kMaxNumIKeys = 8; + Status s = GetAllKeyVersions(db_, begin_key, end_key, kMaxNumIKeys, + &versions); + fprintf(stderr, + "Internal keys in default CF [%s, %s] (max %" ROCKSDB_PRIszt + ")\n", + begin_key.ToString(true /* hex */).c_str(), + end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys); + for (const KeyVersion& kv : versions) { + fprintf(stderr, " key %s seq %" PRIu64 " type %d\n", + Slice(kv.user_key).ToString(true).c_str(), kv.sequence, + kv.type); + } +#endif // ROCKSDB_LITE + fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n", + db_->GetLatestSequenceNumber()); shared->SetVerificationFailure(); } } From a6a9213a367819bbe2c16b398f00f7dfa9b0dc18 Mon Sep 17 00:00:00 2001 From: Tim Hatch Date: Tue, 9 Jul 2019 10:47:31 -0700 Subject: [PATCH 210/572] Fix interpreter lines for files with python2-only syntax.
Reviewed By: lisroach Differential Revision: D15362271 fbshipit-source-id: 48fab12ab6e55a8537b19b4623d2545ca9950ec5 --- coverage/parse_gcov_output.py | 1 + tools/db_crashtest.py | 2 +- tools/ldb_test.py | 1 + tools/write_stress_runner.py | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py index fbdabd96839..a5e98722202 100644 --- a/coverage/parse_gcov_output.py +++ b/coverage/parse_gcov_output.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import re import sys diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 173a6a8da9c..2a38d4c96d9 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os import sys diff --git a/tools/ldb_test.py b/tools/ldb_test.py index 26167ee83fd..4403379460b 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import os import glob diff --git a/tools/write_stress_runner.py b/tools/write_stress_runner.py index 9a0e920a724..fc0c99c235a 100644 --- a/tools/write_stress_runner.py +++ b/tools/write_stress_runner.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import subprocess import argparse From cb19e7411f17713adcfefbd45988dc6b18174914 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 9 Jul 2019 11:01:12 -0700 Subject: [PATCH 211/572] Fix bugs in DBWALTest.kTolerateCorruptedTailRecords triggered by #5520 (#5550) Summary: https://github.com/facebook/rocksdb/pull/5520 caused a buffer overflow bug in DBWALTest.kTolerateCorruptedTailRecords. Fix it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5550 Test Plan: Run the test in UBSAN. It used to fail. Now it succeeds. Differential Revision: D16165516 fbshipit-source-id: 42c56a6bc64eb091f054b87757fcbef60da825f7 --- db/db_wal_test.cc | 4 +++- file/file_util.cc | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 4859bdc90f4..2d5e7bc1d53 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -824,7 +824,9 @@ class RecoveryTestHelper { // Create WAL files with values filled in static void FillData(DBWALTest* test, const Options& options, const size_t wal_count, size_t* count) { - const ImmutableDBOptions db_options(options); + // Calling internal functions requires sanitized options. + Options sanitized_options = SanitizeOptions(test->dbname_, options); + const ImmutableDBOptions db_options(sanitized_options); *count = 0; diff --git a/file/file_util.cc b/file/file_util.cc index 050d25da1a7..ee52bf640fb 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -110,6 +110,7 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { bool same = false; + assert(!db_options->db_paths.empty()); Status s = db_options->env->AreFilesSame(db_options->wal_dir, db_options->db_paths[0].path, &same); if (s.IsNotSupported()) { From aa0367aabbb2ee891a4f7674351d8b10875670fa Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 9 Jul 2019 12:46:01 -0700 Subject: [PATCH 212/572] Allow ldb to open DB as secondary (#5537) Summary: Right now ldb can open a running DB through the read-only DB interface.
However, it might leave info log files in the read-only DB directory. Add an option to open the DB as secondary to avoid it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5537 Test Plan: Run ./ldb scan --max_keys=10 --db=/tmp/rocksdbtest-2491/dbbench --secondary_path=/tmp --no_value --hex and ./ldb get 0x00000000000000103030303030303030 --hex --db=/tmp/rocksdbtest-2491/dbbench --secondary_path=/tmp against a normal db_bench run and observe the output changes. Also observe that no new info log files are created under /tmp/rocksdbtest-2491/dbbench. Run without --secondary_path and observe that new info log files are created under /tmp/rocksdbtest-2491/dbbench. Differential Revision: D16113886 fbshipit-source-id: 4e09dec47c2528f6ca08a9e7a7894ba2d9daebbb --- HISTORY.md | 1 + include/rocksdb/utilities/ldb_cmd.h | 5 +++++ tools/ldb_cmd.cc | 29 +++++++++++++++++++++++++---- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d7eb51160ee..099c9f37e86 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,7 @@ * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. +* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index 57ab88a34eb..e7000742d1b 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -31,6 +31,7 @@ class LDBCommand { // Command-line arguments static const std::string ARG_DB; static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; static const std::string ARG_HEX; static const std::string ARG_KEY_HEX; static const std::string ARG_VALUE_HEX; @@ -128,6 +129,10 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb will leave the source directory completely intact.
+ std::string secondary_path_; std::string column_family_name_; DB* db_; DBWithTTL* db_ttl_; diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index fba32d9d622..8f4258cf36e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -47,6 +47,7 @@ namespace rocksdb { const std::string LDBCommand::ARG_DB = "db"; const std::string LDBCommand::ARG_PATH = "path"; +const std::string LDBCommand::ARG_SECONDARY_PATH = "secondary_path"; const std::string LDBCommand::ARG_HEX = "hex"; const std::string LDBCommand::ARG_KEY_HEX = "key_hex"; const std::string LDBCommand::ARG_VALUE_HEX = "value_hex"; @@ -321,6 +322,12 @@ LDBCommand::LDBCommand(const std::map& options, column_family_name_ = kDefaultColumnFamilyName; } + itr = options.find(ARG_SECONDARY_PATH); + secondary_path_ = ""; + if (itr != options.end()) { + secondary_path_ = itr->second; + } + is_key_hex_ = IsKeyHex(options, flags); is_value_hex_ = IsValueHex(options, flags); is_db_ttl_ = IsFlagPresent(flags, ARG_TTL); @@ -360,6 +367,10 @@ void LDBCommand::OpenDB() { exec_state_ = LDBCommandExecuteResult::Failed( "ldb doesn't support TTL DB with multiple column families"); } + if (!secondary_path_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Open as secondary is not supported for TTL DB yet."); + } if (is_read_only_) { st = DBWithTTL::Open(options_, db_path_, &db_ttl_, 0, true); } else { @@ -382,7 +393,7 @@ void LDBCommand::OpenDB() { } } } - if (is_read_only_) { + if (is_read_only_ && secondary_path_.empty()) { if (column_families_.empty()) { st = DB::OpenForReadOnly(options_, db_path_, &db_); } else { @@ -391,10 +402,19 @@ void LDBCommand::OpenDB() { } } else { if (column_families_.empty()) { - st = DB::Open(options_, db_path_, &db_); + if (secondary_path_.empty()) { + st = DB::Open(options_, db_path_, &db_); + } else { + st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_); + } } else { - st = DB::Open(options_, db_path_, column_families_, &handles_opened, - &db_); + if (secondary_path_.empty()) { + st = DB::Open(options_, db_path_, column_families_, &handles_opened, + &db_); + } else { + st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, + column_families_, &handles_opened, &db_); + } } } } @@ -452,6 +472,7 @@ ColumnFamilyHandle* LDBCommand::GetCfHandle() { std::vector LDBCommand::BuildCmdLineOptions( std::vector options) { std::vector ret = {ARG_DB, + ARG_SECONDARY_PATH, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, From f786b4a5b4f1f162a7e7452b33e2e5cf0d755b9b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 9 Jul 2019 12:57:02 -0700 Subject: [PATCH 213/572] Improve result print on atomic flush stress test failure (#5549) Summary: When atomic flush stress test fails, we print internal keys within the range with mismatched key/values for all column families. Test plan (on devserver) Manually hack the code to randomly insert wrong data. Run the test. 
``` $make clean && COMPILE_WITH_TSAN=1 make -j32 db_stress $./db_stress -test_atomic_flush=true -ops_per_thread=10000 ``` Check that proper error messages are printed, as follows: ``` 2019/07/08-17:40:14 Starting verification Verification failed Latest Sequence Number: 190903 [default] 000000000000050B => 56290000525350515E5F5C5D5A5B5859 [3] 0000000000000533 => EE100000EAEBE8E9E6E7E4E5E2E3E0E1FEFFFCFDFAFBF8F9 Internal keys in CF 'default', [000000000000050B, 0000000000000533] (max 8) key 000000000000050B seq 139920 type 1 key 0000000000000533 seq 0 type 1 Internal keys in CF '3', [000000000000050B, 0000000000000533] (max 8) key 0000000000000533 seq 0 type 1 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5549 Differential Revision: D16158709 Pulled By: riversand963 fbshipit-source-id: f07fa87763f87b3bd908da03c956709c6456bcab --- tools/db_stress.cc | 77 +++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 66a10d4f3a2..3f767a9e76a 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -923,7 +923,8 @@ class SharedState { stress_test_(stress_test), verification_failure_(false), no_overwrite_ids_(FLAGS_column_families), - values_(nullptr) { + values_(nullptr), + printing_verification_results_(false) { // Pick random keys in each column family that will not experience // overwrite @@ -1204,6 +1205,16 @@ class SharedState { return expected_mmap_buffer_.get() != nullptr; } + bool PrintingVerificationResults() { + bool tmp = false; + return !printing_verification_results_.compare_exchange_strong( + tmp, true, std::memory_order_relaxed); + } + + void FinishPrintingVerificationResults() { + printing_verification_results_.store(false, std::memory_order_relaxed); + } + private: port::Mutex mu_; port::CondVar cv_; @@ -1231,6 +1242,7 @@ class SharedState { // and storing it in the container may require copying depending on the impl. std::vector > > key_locks_; std::unique_ptr expected_mmap_buffer_; + std::atomic printing_verification_results_; }; const uint32_t SharedState::UNKNOWN_SENTINEL = 0xfffffffe; @@ -4235,6 +4247,7 @@ class AtomicFlushStressTest : public StressTest { } break; } else if (valid_cnt != iters.size()) { + shared->SetVerificationFailure(); for (size_t i = 0; i != num; ++i) { if (!iters[i]->Valid()) { if (statuses[i].ok()) { @@ -4250,13 +4263,19 @@ class AtomicFlushStressTest : public StressTest { column_families_[i]->GetName().c_str()); } } - shared->SetVerificationFailure(); + break; + } + if (shared->HasVerificationFailedYet()) { break; } // If the program reaches here, then all column families' iterators are // still valid. 
+ if (shared->PrintingVerificationResults()) { + continue; + } Slice key; Slice value; + int num_mismatched_cfs = 0; for (size_t i = 0; i != num; ++i) { if (i == 0) { key = iters[i]->key(); @@ -4264,11 +4283,16 @@ class AtomicFlushStressTest : public StressTest { } else { int cmp = key.compare(iters[i]->key()); if (cmp != 0) { - fprintf(stderr, "Verification failed\n"); - fprintf(stderr, "[%s] %s => %s\n", - column_families_[0]->GetName().c_str(), - key.ToString(true /* hex */).c_str(), - value.ToString(true /* hex */).c_str()); + ++num_mismatched_cfs; + if (1 == num_mismatched_cfs) { + fprintf(stderr, "Verification failed\n"); + fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n", + db_->GetLatestSequenceNumber()); + fprintf(stderr, "[%s] %s => %s\n", + column_families_[0]->GetName().c_str(), + key.ToString(true /* hex */).c_str(), + value.ToString(true /* hex */).c_str()); + } fprintf(stderr, "[%s] %s => %s\n", column_families_[i]->GetName().c_str(), iters[i]->key().ToString(true /* hex */).c_str(), @@ -4283,29 +4307,38 @@ class AtomicFlushStressTest : public StressTest { begin_key = iters[i]->key(); end_key = key; } - // We should print both of CF 0 and i but GetAllKeyVersions() now - // only supports default CF. std::vector versions; const size_t kMaxNumIKeys = 8; - Status s = GetAllKeyVersions(db_, begin_key, end_key, kMaxNumIKeys, - &versions); - fprintf(stderr, - "Internal keys in default CF [%s, %s] (max %" ROCKSDB_PRIszt - ")\n", - begin_key.ToString(true /* hex */).c_str(), - end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys); - for (const KeyVersion& kv : versions) { - fprintf(stderr, " key %s seq %" PRIu64 " type %d\n", - Slice(kv.user_key).ToString(true).c_str(), kv.sequence, - kv.type); + const auto print_key_versions = [&](ColumnFamilyHandle* cfh) { + Status s = GetAllKeyVersions(db_, cfh, begin_key, end_key, + kMaxNumIKeys, &versions); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.ToString().c_str()); + return; + } + assert(nullptr != cfh); + fprintf(stderr, + "Internal keys in CF '%s', [%s, %s] (max %" ROCKSDB_PRIszt + ")\n", + cfh->GetName().c_str(), + begin_key.ToString(true /* hex */).c_str(), + end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys); + for (const KeyVersion& kv : versions) { + fprintf(stderr, " key %s seq %" PRIu64 " type %d\n", + Slice(kv.user_key).ToString(true).c_str(), kv.sequence, + kv.type); + } + }; + if (1 == num_mismatched_cfs) { + print_key_versions(column_families_[0]); } + print_key_versions(column_families_[i]); #endif // ROCKSDB_LITE - fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n", - db_->GetLatestSequenceNumber()); shared->SetVerificationFailure(); } } } + shared->FinishPrintingVerificationResults(); for (auto& iter : iters) { iter->Next(); } From 60d8b19836745ce01deed59138802a3aa75bc488 Mon Sep 17 00:00:00 2001 From: ggaurav28 <51927531+ggaurav28@users.noreply.github.com> Date: Tue, 9 Jul 2019 14:48:07 -0700 Subject: [PATCH 214/572] Implemented a file logger that uses WritableFileWriter (#5491) Summary: Current PosixLogger performs IO operations using posix calls. Thus the current implementation will not work for non-posix env. Created a new logger class EnvLogger that uses env specific WritableFileWriter for IO operations. 
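A brief usage sketch of the new factory (hypothetical file name; not code from this diff). Because `NewEnvLogger()` only relies on `Env::NewWritableFile()`, the resulting logger works with any `Env`, not just the posix one:

```
#include <memory>

#include "rocksdb/env.h"

void WriteOneInfoLine() {
  rocksdb::Env* env = rocksdb::Env::Default();
  std::shared_ptr<rocksdb::Logger> logger;
  rocksdb::Status s =
      rocksdb::NewEnvLogger("/tmp/example_info.log", env, &logger);
  if (!s.ok() || !logger) {
    return;
  }
  logger->SetInfoLogLevel(rocksdb::InfoLogLevel::INFO_LEVEL);
  // Log() buffers through the underlying WritableFileWriter; Flush() forces
  // the buffered bytes out.
  rocksdb::Log(logger, "EnvLogger backed by %s", "WritableFileWriter");
  logger->Flush();
}
```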
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5491 Test Plan: make check Differential Revision: D15909002 Pulled By: ggaurav28 fbshipit-source-id: 13a8105176e8e42db0c59798d48cb6a0dbccc965 --- CMakeLists.txt | 1 + Makefile | 4 + TARGETS | 5 + env/env.cc | 21 ++++ env/env_posix.cc | 7 +- env/io_posix.cc | 1 - include/rocksdb/env.h | 12 ++- logging/auto_roll_logger_test.cc | 24 +---- logging/env_logger.h | 165 +++++++++++++++++++++++++++++++ logging/env_logger_test.cc | 164 ++++++++++++++++++++++++++++++ src.mk | 1 + test_util/testutil.cc | 18 ++++ test_util/testutil.h | 3 + util/file_reader_writer.h | 2 +- 14 files changed, 400 insertions(+), 28 deletions(-) create mode 100644 logging/env_logger.h create mode 100644 logging/env_logger_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ca338bd63f..50e082662b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -949,6 +949,7 @@ if(WITH_TESTS) env/mock_env_test.cc file/delete_scheduler_test.cc logging/auto_roll_logger_test.cc + logging/env_logger_test.cc logging/event_logger_test.cc memory/arena_test.cc memtable/inlineskiplist_test.cc diff --git a/Makefile b/Makefile index b0b52a37365..f1834e0ecf9 100644 --- a/Makefile +++ b/Makefile @@ -432,6 +432,7 @@ TESTS = \ inlineskiplist_test \ env_basic_test \ env_test \ + env_logger_test \ hash_test \ thread_local_test \ rate_limiter_test \ @@ -1529,6 +1530,9 @@ filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +env_logger_test: logging/env_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 3935f1f740d..82e1d375d96 100644 --- a/TARGETS +++ b/TARGETS @@ -368,6 +368,11 @@ ROCKS_TESTS = [ "logging/auto_roll_logger_test.cc", "serial", ], + [ + "env_logger_test", + "logging/env_logger_test.cc", + "serial", + ], [ "autovector_test", "util/autovector_test.cc", diff --git a/env/env.cc b/env/env.cc index e5e0e99c0a0..87b6b35c16c 100644 --- a/env/env.cc +++ b/env/env.cc @@ -10,6 +10,7 @@ #include "rocksdb/env.h" #include +#include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" @@ -22,6 +23,11 @@ namespace rocksdb { Env::~Env() { } +Status Env::NewLogger(const std::string& fname, + std::shared_ptr* result) { + return NewEnvLogger(fname, this, result); +} + std::string Env::PriorityToString(Env::Priority priority) { switch (priority) { case Env::Priority::BOTTOM: @@ -422,5 +428,20 @@ EnvOptions::EnvOptions() { AssignEnvOptions(this, options); } +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result) { + EnvOptions options; + // TODO: Tune the buffer size. 
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr writable_file; + const auto status = env->NewWritableFile(fname, &writable_file, options); + if (!status.ok()) { + return status; + } + + *result = std::make_shared(std::move(writable_file), fname, + options, env); + return Status::OK(); +} } // namespace rocksdb diff --git a/env/env_posix.cc b/env/env_posix.cc index c0edb00968e..7f7f6b2df5b 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -887,13 +887,14 @@ class PosixEnv : public Env { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w" + f = fopen(fname.c_str(), + "w" #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC + "e" // glibc extension to enable O_CLOEXEC #endif #endif - ); + ); } if (f == nullptr) { result->reset(); diff --git a/env/io_posix.cc b/env/io_posix.cc index 304c4ffe1c7..293516feee8 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -27,7 +27,6 @@ #include #include #endif -#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index ba8978dc810..67464cc5c55 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -395,9 +395,11 @@ class Env { // same directory. virtual Status GetTestDirectory(std::string* path) = 0; - // Create and return a log file for storing informational messages. + // Creates and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide a custom + // logger. virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) = 0; + std::shared_ptr* result); // Returns the number of micro-seconds since some fixed point in time. // It is often used as system time such as in GenericRateLimiter @@ -1563,4 +1565,10 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); // This is a factory method for TimedEnv defined in utilities/env_timed.cc. Env* NewTimedEnv(Env* base_env); +// Returns an instance of logger that can be used for storing informational +// messages.
+// This is a factory method for EnvLogger declared in logging/env_logger.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result); + } // namespace rocksdb diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index cce98d374ef..fa668114cfb 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -22,6 +22,7 @@ #include "rocksdb/db.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace { @@ -444,7 +445,7 @@ TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - flush_thread = port::Thread ([&]() { auto_roll_logger->Flush(); }); + flush_thread = port::Thread([&]() { auto_roll_logger->Flush(); }); TEST_SYNC_POINT( "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, @@ -557,25 +558,6 @@ static std::vector GetOldFileNames(const std::string& path) { return ret; } -// Return the number of lines where a given pattern was found in the file -static size_t GetLinesCount(const std::string& fname, - const std::string& pattern) { - std::stringstream ssbuf; - std::string line; - size_t count = 0; - - std::ifstream inFile(fname.c_str()); - ssbuf << inFile.rdbuf(); - - while (getline(ssbuf, line)) { - if (line.find(pattern) != std::string::npos) { - count++; - } - } - - return count; -} - TEST_F(AutoRollLoggerTest, LogHeaderTest) { static const size_t MAX_HEADERS = 10; static const size_t LOG_MAX_SIZE = 1024 * 5; @@ -627,7 +609,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { // verify that the files rolled over ASSERT_NE(oldfname, newfname); // verify that the old log contains all the header logs - ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); + ASSERT_EQ(test::GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); } } } diff --git a/logging/env_logger.h b/logging/env_logger.h new file mode 100644 index 00000000000..94cf129228c --- /dev/null +++ b/logging/env_logger.h @@ -0,0 +1,165 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that uses a custom Env object for logging.
+ +#pragma once + +#include +#include +#include "port/sys_time.h" +#include + +#include "monitoring/iostats_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "util/file_reader_writer.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +class EnvLogger : public Logger { + public: + EnvLogger(std::unique_ptr&& writable_file, + const std::string& fname, const EnvOptions& options, Env* env, + InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(writable_file), fname, options, env), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + + ~EnvLogger() { + if (!closed_) { + closed_ = true; + CloseHelper(); + } + } + + private: + void FlushLocked() { + mutex_.AssertHeld(); + if (flush_pending_) { + flush_pending_ = false; + file_.Flush(); + } + last_flush_micros_ = env_->NowMicros(); + } + + void Flush() override { + TEST_SYNC_POINT("EnvLogger::Flush:Begin1"); + TEST_SYNC_POINT("EnvLogger::Flush:Begin2"); + + MutexLock l(&mutex_); + FlushLocked(); + } + + Status CloseImpl() override { return CloseHelper(); } + + Status CloseHelper() { + mutex_.Lock(); + const auto close_status = file_.Close(); + mutex_.Unlock(); + + if (close_status.ok()) { + return close_status; + } + return Status::IOError("Close of log file failed with error:" + + (close_status.getState() + ? std::string(close_status.getState()) + : std::string())); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + + const uint64_t thread_id = env_->GetThreadID(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 65536; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + mutex_.Lock(); + // We will ignore any error returned by Append(). + file_.Append(Slice(base, p - base)); + flush_pending_ = true; + const uint64_t now_micros = env_->NowMicros(); + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + FlushLocked(); + } + mutex_.Unlock(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + size_t GetLogFileSize() const override { + MutexLock l(&mutex_); + return file_.GetFileSize(); + } + + private: + WritableFileWriter file_; + mutable port::Mutex mutex_; // Mutex to protect the shared variables below. 
+ const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + std::atomic flush_pending_; +}; + +} // namespace rocksdb diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc new file mode 100644 index 00000000000..316c231fad9 --- /dev/null +++ b/logging/env_logger_test.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "env/mock_env.h" +#include "logging/env_logger.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +namespace { +// In this test we only want to Log some simple log message with +// no format. +void LogMessage(std::shared_ptr logger, const std::string& message) { + Log(logger, "%s", message.c_str()); +} + +// Helper method to write the message num_times in the given logger. +void WriteLogs(std::shared_ptr logger, const std::string& message, + int num_times) { + for (int ii = 0; ii < num_times; ++ii) { + LogMessage(logger, message); + } +} + +} // namespace + +class EnvLoggerTest : public testing::Test { + public: + Env* env_; + + EnvLoggerTest() : env_(Env::Default()) {} + + ~EnvLoggerTest() = default; + + std::shared_ptr CreateLogger() { + std::shared_ptr result; + assert(NewEnvLogger(kLogFile, env_, &result).ok()); + assert(result); + result->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + return result; + } + + void DeleteLogFile() { + ASSERT_OK(env_->DeleteFile(kLogFile)); + } + + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; +}; + +const std::string EnvLoggerTest::kSampleMessage = + "this is the message to be written to the log file!!"; +const std::string EnvLoggerTest::kLogFile = test::PerThreadDBPath("log_file"); + +TEST_F(EnvLoggerTest, EmptyLogFile) { + auto logger = CreateLogger(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Check the size of the log file. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, LogMultipleLines) { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + // Flush the logs. + logger->Flush(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Overwrite) { + { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + } + + // Now reopen the file again. + { + auto logger = CreateLogger(); + + // File should be empty. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), 0); + ASSERT_EQ(logger->Close(), Status::OK()); + } + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Close) { + auto logger = CreateLogger(); + + // Write multiple lines. 
+ const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, ConcurrentLogging) { + auto logger = CreateLogger(); + + const int kNumIter = 20; + std::function cb = [&]() { + WriteLogs(logger, kSampleMessage, kNumIter); + logger->Flush(); + }; + + // Write to the logs from multiple threads. + std::vector threads; + const int kNumThreads = 5; + // Create threads. + for (int ii = 0; ii < kNumThreads; ++ii) { + threads.push_back(port::Thread(cb)); + } + + // Wait for them to complete. + for (auto& th : threads) { + th.join(); + } + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Verify the log file. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), + kNumIter * kNumThreads); + DeleteLogFile(); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src.mk b/src.mk index 7c35ee67589..8b0122dbe22 100644 --- a/src.mk +++ b/src.mk @@ -345,6 +345,7 @@ MAIN_SOURCES = \ env/env_test.cc \ env/mock_env_test.cc \ logging/auto_roll_logger_test.cc \ + logging/env_logger_test.cc \ logging/event_logger_test.cc \ memory/arena_test.cc \ memtable/inlineskiplist_test.cc \ diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 61a49d88a17..46f878f8ce5 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include "db/memtable_list.h" @@ -426,5 +427,22 @@ bool IsDirectIOSupported(Env* env, const std::string& dir) { return s.ok(); } +size_t GetLinesCount(const std::string& fname, const std::string& pattern) { + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(fname.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(pattern) != std::string::npos) { + count++; + } + } + + return count; +} + } // namespace test } // namespace rocksdb diff --git a/test_util/testutil.h b/test_util/testutil.h index bc0b2b07d5f..bb732ff3a5a 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -750,5 +750,8 @@ Status DestroyDir(Env* env, const std::string& dir); bool IsDirectIOSupported(Env* env, const std::string& dir); +// Return the number of lines where a given pattern was found in a file. +size_t GetLinesCount(const std::string& fname, const std::string& pattern); + } // namespace test } // namespace rocksdb diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 0a7e5032d2f..0c5089d0758 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -282,7 +282,7 @@ class WritableFileWriter { // returns NotSupported status. Status SyncWithoutFlush(bool use_fsync); - uint64_t GetFileSize() { return filesize_; } + uint64_t GetFileSize() const { return filesize_; } Status InvalidateCache(size_t offset, size_t length) { return writable_file_->InvalidateCache(offset, length); From 82d8ca8ade08b2c26acad33d954ba0b4cd770e2d Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 10 Jul 2019 11:26:22 -0700 Subject: [PATCH 215/572] Upload db directory during cleanup for certain tests (#5554) Summary: Add an extra cleanup step so that the db directory can be saved and uploaded.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5554 Reviewed By: yancouto Differential Revision: D16168844 Pulled By: riversand963 fbshipit-source-id: ec7b2cee5f11c7d388c36531f8b076d648e2fb19 --- build_tools/rocksdb-lego-determinator | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index dc32b3af9ff..af86a16c2be 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -63,6 +63,21 @@ CLEANUP_ENV=" 'user':'root' }" +UPLOAD_DB_DIR=" +{ + 'name':'Upload database directory', + 'shell':'tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/', + 'user':'root', + 'cleanup':true, + 'provide_artifacts': [ + { + 'name':'rocksdb_db_dir', + 'paths': ['rocksdb_db.tar.gz'], + 'bundle': false, + }, + ], +}" + # We will eventually set the RATIO to 1, but we want do this # in steps. RATIO=$(nproc) will make it work as J=1 if [ -z $RATIO ]; then @@ -428,7 +443,8 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER - } + }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -519,6 +535,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -586,6 +603,7 @@ UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -678,6 +696,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } From 1a59b6e2a97c9933d323bdeb379bb72c43dfc41c Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Thu, 11 Jul 2019 12:40:08 -0700 Subject: [PATCH 216/572] Cache simulator: Add a ghost cache for admission control and a hybrid row-block cache. (#5534) Summary: This PR adds a ghost cache for admission control. Specifically, it admits an entry on its second access. It also adds a hybrid row-block cache that caches the referenced key-value pairs of a Get/MultiGet request instead of its blocks. 
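Conceptually (a simplified sketch with made-up names, not the simulator's actual classes), second-access admission means a miss first registers the key in the ghost cache, and only a key the ghost cache already remembers gets inserted into the real cache:

```
#include <string>
#include <unordered_set>

// Sketch of a ghost cache used purely for admission control.
class GhostCacheSketch {
 public:
  // Returns true if the key was seen before, i.e. this is at least its
  // second access and the entry should be admitted to the real cache.
  bool AdmitOnSecondAccess(const std::string& key) {
    if (seen_.count(key) > 0) {
      return true;
    }
    seen_.insert(key);  // First access: remember it in the ghost cache only.
    return false;
  }

 private:
  // A production ghost cache bounds its footprint (e.g. an LRU over key
  // hashes); an unbounded set keeps this sketch short.
  std::unordered_set<std::string> seen_;
};
```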
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5534 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16101124 Pulled By: HaoyuHuang fbshipit-source-id: b99edda6418a888e94eb40f71ece45d375e234b1 --- CMakeLists.txt | 1 + Makefile | 4 + TARGETS | 5 + src.mk | 1 + tools/block_cache_trace_analyzer.cc | 29 +- tools/block_cache_trace_analyzer_test.cc | 11 +- trace_replay/block_cache_tracer.cc | 8 + trace_replay/block_cache_tracer.h | 1 + utilities/simulator_cache/cache_simulator.cc | 212 +++++++++-- utilities/simulator_cache/cache_simulator.h | 126 ++++++- .../simulator_cache/cache_simulator_test.cc | 337 ++++++++++++++++++ 11 files changed, 684 insertions(+), 51 deletions(-) create mode 100644 utilities/simulator_cache/cache_simulator_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 50e082662b5..c47f9811ef2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1006,6 +1006,7 @@ if(WITH_TESTS) utilities/options/options_util_test.cc utilities/persistent_cache/hash_table_test.cc utilities/persistent_cache/persistent_cache_test.cc + utilities/simulator_cache/cache_simulator_test.cc utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc diff --git a/Makefile b/Makefile index f1834e0ecf9..1828b833b02 100644 --- a/Makefile +++ b/Makefile @@ -510,6 +510,7 @@ TESTS = \ cassandra_serialize_test \ ttl_test \ backupable_db_test \ + cache_simulator_test \ sim_cache_test \ version_edit_test \ version_set_test \ @@ -1321,6 +1322,9 @@ backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TE checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +cache_simulator_test: utilities/simulator_cache/cache_simulator_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 82e1d375d96..6ef3da179dc 100644 --- a/TARGETS +++ b/TARGETS @@ -423,6 +423,11 @@ ROCKS_TESTS = [ "cache/cache_test.cc", "serial", ], + [ + "cache_simulator_test", + "utilities/simulator_cache/cache_simulator_test.cc", + "serial", + ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", diff --git a/src.mk b/src.mk index 8b0122dbe22..bc49b7ce074 100644 --- a/src.mk +++ b/src.mk @@ -405,6 +405,7 @@ MAIN_SOURCES = \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ + utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 4770348a79d..bd8d8971bfc 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -23,9 +23,12 @@ DEFINE_string( block_cache_sim_config_path, "", "The config file path. One cache configuration per line. The format of a " "cache configuration is " - "cache_name,num_shard_bits,cache_capacity_1,...,cache_capacity_N. " - "cache_name is lru or lru_priority. cache_capacity can be xK, xM or xG " - "where x is a positive number."); + "cache_name,num_shard_bits,ghost_capacity,cache_capacity_1,...,cache_" + "capacity_N. 
Supported cache names are lru, lru_priority, lru_hybrid, and " + "lru_hybrid_no_insert_on_row_miss. User may also add a prefix 'ghost_' to " + "a cache_name to add a ghost cache in front of the real cache. " + "ghost_capacity and cache_capacity can be xK, xM or xG where x is a " + "positive number."); DEFINE_int32(block_cache_trace_downsample_ratio, 1, "The trace collected accesses on one in every " "block_cache_trace_downsample_ratio blocks. We scale " @@ -104,6 +107,10 @@ const std::string kGroupbyAll = "all"; const std::set kGroupbyLabels{ kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel, kGroupbyBlockType, kGroupbyCaller, kGroupbyAll}; +const std::string kSupportedCacheNames = + " lru ghost_lru lru_priority ghost_lru_priority lru_hybrid " + "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss " + "ghost_lru_hybrid_no_insert_on_row_miss "; std::string block_type_to_string(TraceType type) { switch (type) { @@ -194,7 +201,8 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { } // Write header. const std::string header = - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"; + "cache_name,num_shard_bits,ghost_capacity,capacity,miss_ratio,total_" + "accesses"; out << header << std::endl; for (auto const& config_caches : cache_simulator_->sim_caches()) { const CacheConfiguration& config = config_caches.first; @@ -205,6 +213,8 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { out << ","; out << config.num_shard_bits; out << ","; + out << config.ghost_cache_capacity; + out << ","; out << config.cache_capacities[i]; out << ","; out << std::fixed << std::setprecision(4) << miss_ratio; @@ -993,18 +1003,21 @@ std::vector parse_cache_config_file( config_strs.push_back(substr); } // Sanity checks. - if (config_strs.size() < 3) { + if (config_strs.size() < 4) { fprintf(stderr, "Invalid cache simulator configuration %s\n", line.c_str()); exit(1); } - if (config_strs[0] != "lru") { - fprintf(stderr, "We only support LRU cache %s\n", line.c_str()); + if (kSupportedCacheNames.find(" " + config_strs[0] + " ") == + std::string::npos) { + fprintf(stderr, "Invalid cache name %s. Supported cache names are %s\n", + line.c_str(), kSupportedCacheNames.c_str()); exit(1); } cache_config.cache_name = config_strs[0]; cache_config.num_shard_bits = ParseUint32(config_strs[1]); - for (uint32_t i = 2; i < config_strs.size(); i++) { + cache_config.ghost_cache_capacity = ParseUint64(config_strs[2]); + for (uint32_t i = 3; i < config_strs.size(); i++) { uint64_t capacity = ParseUint64(config_strs[i]); if (capacity == 0) { fprintf(stderr, "Invalid cache capacity %s, %s\n", diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index 21d8bcbbb3f..efb202cb4ab 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -205,7 +205,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } { // Generate a cache sim config. 
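+    // The new third field is the ghost cache capacity; it is only consulted
+    // for cache names carrying the "ghost_" prefix, hence 0 for plain "lru".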
- std::string config = "lru,1,1K,1M,1G"; + std::string config = "lru,1,0,1K,1M,1G"; std::ofstream out(block_cache_sim_config_path_); ASSERT_TRUE(out.is_open()); out << config << std::endl; @@ -230,14 +230,15 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { getline(ss, substr, ','); result_strs.push_back(substr); } - ASSERT_EQ(5, result_strs.size()); + ASSERT_EQ(6, result_strs.size()); ASSERT_LT(config_index, expected_capacities.size()); ASSERT_EQ("lru", result_strs[0]); // cache_name ASSERT_EQ("1", result_strs[1]); // num_shard_bits + ASSERT_EQ("0", result_strs[2]); // ghost_cache_capacity ASSERT_EQ(std::to_string(expected_capacities[config_index]), - result_strs[2]); // cache_capacity - ASSERT_EQ("100.0000", result_strs[3]); // miss_ratio - ASSERT_EQ("50", result_strs[4]); // number of accesses. + result_strs[3]); // cache_capacity + ASSERT_EQ("100.0000", result_strs[4]); // miss_ratio + ASSERT_EQ("50", result_strs[5]); // number of accesses. config_index++; } ASSERT_EQ(expected_capacities.size(), config_index); diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 115a75d924b..62db942044c 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -45,6 +45,14 @@ bool BlockCacheTraceHelper::ShouldTraceGetId(TableReaderCaller caller) { caller == TableReaderCaller::kUserMultiGet; } +bool BlockCacheTraceHelper::IsUserAccess(TableReaderCaller caller) { + return caller == TableReaderCaller::kUserGet || + caller == TableReaderCaller::kUserMultiGet || + caller == TableReaderCaller::kUserIterator || + caller == TableReaderCaller::kUserApproximateSize || + caller == TableReaderCaller::kUserVerifyChecksum; +} + BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, std::unique_ptr&& trace_writer) diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 4788a3f447f..66cbb5adefa 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -23,6 +23,7 @@ class BlockCacheTraceHelper { static bool ShouldTraceReferencedKey(TraceType block_type, TableReaderCaller caller); static bool ShouldTraceGetId(TableReaderCaller caller); + static bool IsUserAccess(TableReaderCaller caller); static const std::string kUnknownColumnFamilyName; static const uint64_t kReservedGetId; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 65f626036b0..ebfc4cd0eb0 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -4,42 +4,177 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "utilities/simulator_cache/cache_simulator.h" +#include "db/dbformat.h" namespace rocksdb { -CacheSimulator::CacheSimulator(std::shared_ptr sim_cache) + +namespace { +const std::string kGhostCachePrefix = "ghost_"; +} + +GhostCache::GhostCache(std::shared_ptr sim_cache) : sim_cache_(sim_cache) {} +bool GhostCache::Admit(const Slice& lookup_key) { + auto handle = sim_cache_->Lookup(lookup_key); + if (handle != nullptr) { + sim_cache_->Release(handle); + return true; + } + sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(), + /*deleter=*/nullptr, /*handle=*/nullptr); + return false; +} + +CacheSimulator::CacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache) + : ghost_cache_(std::move(ghost_cache)), sim_cache_(sim_cache) {} + void CacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool admit = true; + const bool is_user_access = + BlockCacheTraceHelper::IsUserAccess(access.caller); + bool is_cache_miss = true; + if (ghost_cache_ && access.no_insert == Boolean::kFalse) { + admit = ghost_cache_->Admit(access.block_key); + } auto handle = sim_cache_->Lookup(access.block_key); - if (handle == nullptr && !access.no_insert) { - sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, - /*deleter=*/nullptr, /*handle=*/nullptr); + if (handle != nullptr) { + sim_cache_->Release(handle); + is_cache_miss = false; + } else { + if (access.no_insert == Boolean::kFalse && admit) { + sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + /*deleter=*/nullptr, /*handle=*/nullptr); + } } + UpdateMetrics(is_user_access, is_cache_miss); } -void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { - auto handle = sim_cache_->Lookup(access.block_key); - if (handle == nullptr && !access.no_insert) { - Cache::Priority priority = Cache::Priority::LOW; - if (access.block_type == TraceType::kBlockTraceFilterBlock || - access.block_type == TraceType::kBlockTraceIndexBlock || - access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { - priority = Cache::Priority::HIGH; +void CacheSimulator::UpdateMetrics(bool is_user_access, bool is_cache_miss) { + num_accesses_ += 1; + if (is_cache_miss) { + num_misses_ += 1; + } + if (is_user_access) { + user_accesses_ += 1; + if (is_cache_miss) { + user_misses_ += 1; } - sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, + } +} + +Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority( + const BlockCacheTraceRecord& access) const { + if (access.block_type == TraceType::kBlockTraceFilterBlock || + access.block_type == TraceType::kBlockTraceIndexBlock || + access.block_type == TraceType::kBlockTraceUncompressionDictBlock) { + return Cache::Priority::HIGH; + } + return Cache::Priority::LOW; +} + +void PrioritizedCacheSimulator::AccessKVPair( + const Slice& key, uint64_t value_size, Cache::Priority priority, + bool no_insert, bool is_user_access, bool* is_cache_miss, bool* admitted, + bool update_metrics) { + assert(is_cache_miss); + assert(admitted); + *is_cache_miss = true; + *admitted = true; + if (ghost_cache_ && !no_insert) { + *admitted = ghost_cache_->Admit(key); + } + auto handle = sim_cache_->Lookup(key); + if (handle != nullptr) { + sim_cache_->Release(handle); + *is_cache_miss = false; + } else if (!no_insert && *admitted && value_size > 0) { + sim_cache_->Insert(key, /*value=*/nullptr, value_size, /*deleter=*/nullptr, /*handle=*/nullptr, priority); } + if (update_metrics) { + UpdateMetrics(is_user_access, 
*is_cache_miss); + } } -double CacheSimulator::miss_ratio() { - uint64_t hits = sim_cache_->get_hit_counter(); - uint64_t misses = sim_cache_->get_miss_counter(); - uint64_t accesses = hits + misses; - return static_cast(misses * 100.0 / accesses); +void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool is_cache_miss = true; + bool admitted = true; + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); } -uint64_t CacheSimulator::total_accesses() { - return sim_cache_->get_hit_counter() + sim_cache_->get_miss_counter(); +std::string HybridRowBlockCacheSimulator::ComputeRowKey( + const BlockCacheTraceRecord& access) { + assert(access.get_id != BlockCacheTraceHelper::kReservedGetId); + Slice key; + if (access.referenced_key_exist_in_block == Boolean::kTrue) { + key = ExtractUserKey(access.referenced_key); + } else { + key = access.referenced_key; + } + return std::to_string(access.sst_fd_number) + "_" + key.ToString(); +} + +void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { + bool is_cache_miss = true; + bool admitted = true; + if (access.get_id != BlockCacheTraceHelper::kReservedGetId) { + // This is a Get/MultiGet request. + const std::string& row_key = ComputeRowKey(access); + if (getid_getkeys_map_[access.get_id].find(row_key) == + getid_getkeys_map_[access.get_id].end()) { + // This is the first time that this key is accessed. Look up the key-value + // pair first. Do not update the miss/accesses metrics here since it will + // be updated later. + AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH, + /*no_insert=*/false, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/false); + InsertResult result = InsertResult::NO_INSERT; + if (admitted && access.referenced_data_size > 0) { + result = InsertResult::INSERTED; + } else if (admitted) { + result = InsertResult::ADMITTED; + } + getid_getkeys_map_[access.get_id][row_key] = + std::make_pair(is_cache_miss, result); + } + std::pair miss_inserted = + getid_getkeys_map_[access.get_id][row_key]; + if (!miss_inserted.first) { + // This is a cache hit. Skip future accesses to its index/filter/data + // blocks. These block lookups are unnecessary if we observe a hit for the + // referenced key-value pair already. Thus, we treat these lookups as + // hits. This is also to ensure the total number of accesses are the same + // when comparing to other policies. + UpdateMetrics(/*is_user_access=*/true, /*is_cache_miss=*/false); + return; + } + // The key-value pair observes a cache miss. We need to access its + // index/filter/data blocks. 
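+    // Whether a missed block is then inserted depends on both the trace's
+    // no_insert flag and insert_blocks_upon_row_kvpair_miss_ (see the
+    // no_insert argument below).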
+ AccessKVPair( + access.block_key, access.block_type, ComputeBlockPriority(access), + /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, + /*is_user_access=*/true, &is_cache_miss, &admitted, + /*update_metrics=*/true); + if (access.referenced_data_size > 0 && + miss_inserted.second == InsertResult::ADMITTED) { + sim_cache_->Insert( + row_key, /*value=*/nullptr, access.referenced_data_size, + /*deleter=*/nullptr, /*handle=*/nullptr, Cache::Priority::HIGH); + getid_getkeys_map_[access.get_id][row_key] = + std::make_pair(true, InsertResult::INSERTED); + } + return; + } + AccessKVPair(access.block_key, access.block_size, + ComputeBlockPriority(access), access.no_insert, + BlockCacheTraceHelper::IsUserAccess(access.caller), + &is_cache_miss, &admitted, /*update_metrics=*/true); } BlockCacheTraceSimulator::BlockCacheTraceSimulator( @@ -56,18 +191,41 @@ Status BlockCacheTraceSimulator::InitializeCaches() { // 1/'downsample_ratio' blocks. uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_; std::shared_ptr sim_cache; - if (config.cache_name == "lru") { - sim_cache = std::make_shared(NewSimCache( + std::unique_ptr ghost_cache; + std::string cache_name = config.cache_name; + if (cache_name.find(kGhostCachePrefix) != std::string::npos) { + ghost_cache.reset(new GhostCache( + NewLRUCache(config.ghost_cache_capacity, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_name = cache_name.substr(kGhostCachePrefix.size()); + } + if (cache_name == "lru") { + sim_cache = std::make_shared( + std::move(ghost_cache), NewLRUCache(simulate_cache_capacity, config.num_shard_bits, /*strict_capacity_limit=*/false, - /*high_pri_pool_ratio=*/0), - /*real_cache=*/nullptr, config.num_shard_bits)); - } else if (config.cache_name == "lru_priority") { - sim_cache = std::make_shared(NewSimCache( + /*high_pri_pool_ratio=*/0)); + } else if (cache_name == "lru_priority") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5)); + } else if (cache_name == "lru_hybrid") { + sim_cache = std::make_shared( + std::move(ghost_cache), + NewLRUCache(simulate_cache_capacity, config.num_shard_bits, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0.5), + /*insert_blocks_upon_row_kvpair_miss=*/true); + } else if (cache_name == "lru_hybrid_no_insert_on_row_miss") { + sim_cache = std::make_shared( + std::move(ghost_cache), NewLRUCache(simulate_cache_capacity, config.num_shard_bits, /*strict_capacity_limit=*/false, /*high_pri_pool_ratio=*/0.5), - /*real_cache=*/nullptr, config.num_shard_bits)); + /*insert_blocks_upon_row_kvpair_miss=*/false); } else { // Not supported. return Status::InvalidArgument("Unknown cache name " + diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index b391d5dc8a5..b6667eeed12 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -5,7 +5,6 @@ #pragma once -#include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -14,22 +13,46 @@ namespace rocksdb { struct CacheConfiguration { std::string cache_name; // LRU. uint32_t num_shard_bits; + uint64_t ghost_cache_capacity; // ghost cache capacity in bytes. std::vector cache_capacities; // simulate cache capacities in bytes. 
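+  // Note: operator== and operator< below deliberately ignore
+  // cache_capacities, so a single configuration entry covers all of its
+  // simulated capacities.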
-  bool operator=(const CacheConfiguration& o) const {
-    return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits;
+  bool operator==(const CacheConfiguration& o) const {
+    return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+           ghost_cache_capacity == o.ghost_cache_capacity;
   }
   bool operator<(const CacheConfiguration& o) const {
     return cache_name < o.cache_name ||
-           (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits);
+           (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits) ||
+           (cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+            ghost_cache_capacity < o.ghost_cache_capacity);
   }
 };
 
+// A ghost cache admits an entry on its second access.
+class GhostCache {
+ public:
+  explicit GhostCache(std::shared_ptr<Cache> sim_cache);
+  ~GhostCache() = default;
+  // No copy and move.
+  GhostCache(const GhostCache&) = delete;
+  GhostCache& operator=(const GhostCache&) = delete;
+  GhostCache(GhostCache&&) = delete;
+  GhostCache& operator=(GhostCache&&) = delete;
+
+  // Returns true if the lookup_key is in the ghost cache.
+  // Returns false otherwise.
+  bool Admit(const Slice& lookup_key);
+
+ private:
+  std::shared_ptr<Cache> sim_cache_;
+};
+
 // A cache simulator that runs against a block cache trace.
 class CacheSimulator {
  public:
-  CacheSimulator(std::shared_ptr<SimCache> sim_cache);
+  CacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                 std::shared_ptr<Cache> sim_cache);
   virtual ~CacheSimulator() = default;
   // No copy and move.
   CacheSimulator(const CacheSimulator&) = delete;
@@ -38,12 +61,37 @@ class CacheSimulator {
   CacheSimulator& operator=(CacheSimulator&&) = delete;
 
   virtual void Access(const BlockCacheTraceRecord& access);
-  void reset_counter() { sim_cache_->reset_counter(); }
-  double miss_ratio();
-  uint64_t total_accesses();
+  void reset_counter() {
+    num_misses_ = 0;
+    num_accesses_ = 0;
+    user_accesses_ = 0;
+    user_misses_ = 0;
+  }
+  double miss_ratio() const {
+    if (num_accesses_ == 0) {
+      return -1;
+    }
+    return static_cast<double>(num_misses_ * 100.0 / num_accesses_);
+  }
+  uint64_t total_accesses() const { return num_accesses_; }
+
+  double user_miss_ratio() const {
+    if (user_accesses_ == 0) {
+      return -1;
+    }
+    return static_cast<double>(user_misses_ * 100.0 / user_accesses_);
+  }
+  uint64_t user_accesses() const { return user_accesses_; }
 
  protected:
-  std::shared_ptr<SimCache> sim_cache_;
+  void UpdateMetrics(bool is_user_access, bool is_cache_miss);
+
+  std::unique_ptr<GhostCache> ghost_cache_;
+  std::shared_ptr<Cache> sim_cache_;
+  uint64_t num_accesses_ = 0;
+  uint64_t num_misses_ = 0;
+  uint64_t user_accesses_ = 0;
+  uint64_t user_misses_ = 0;
 };
 
 // A prioritized cache simulator that runs against a block cache trace.
@@ -51,9 +99,65 @@ class CacheSimulator {
 // priority in the cache.
 class PrioritizedCacheSimulator : public CacheSimulator {
  public:
-  PrioritizedCacheSimulator(std::shared_ptr<SimCache> sim_cache)
-      : CacheSimulator(sim_cache) {}
+  PrioritizedCacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                            std::shared_ptr<Cache> sim_cache)
+      : CacheSimulator(std::move(ghost_cache), sim_cache) {}
   void Access(const BlockCacheTraceRecord& access) override;
+
+ protected:
+  // Looks up (and possibly inserts) the key-value pair; sets *is_cache_miss
+  // on a miss and *admitted to the ghost cache's admission decision.
+  void AccessKVPair(const Slice& key, uint64_t value_size,
+                    Cache::Priority priority, bool no_insert,
+                    bool is_user_access, bool* is_cache_miss, bool* admitted,
+                    bool update_metrics);
+
+  Cache::Priority ComputeBlockPriority(
+      const BlockCacheTraceRecord& access) const;
+};
+
+// A hybrid row and block cache simulator.
It looks up/inserts key-value pairs +// referenced by Get/MultiGet requests, and not their accessed index/filter/data +// blocks. +// +// Upon a Get/MultiGet request, it looks up the referenced key first. +// If it observes a cache hit, future block accesses on this key-value pair is +// skipped since the request is served already. Otherwise, it continues to look +// up/insert its index/filter/data blocks. It also inserts the referenced +// key-value pair in the cache for future lookups. +class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { + public: + HybridRowBlockCacheSimulator(std::unique_ptr&& ghost_cache, + std::shared_ptr sim_cache, + bool insert_blocks_upon_row_kvpair_miss) + : PrioritizedCacheSimulator(std::move(ghost_cache), sim_cache), + insert_blocks_upon_row_kvpair_miss_( + insert_blocks_upon_row_kvpair_miss) {} + void Access(const BlockCacheTraceRecord& access) override; + + private: + // Row key is a concatenation of the access's fd_number and the referenced + // user key. + // TODO(haoyu): the row key should contain sequence number. + std::string ComputeRowKey(const BlockCacheTraceRecord& access); + + enum InsertResult : char { + INSERTED, + ADMITTED, + NO_INSERT, + }; + + // A map stores get_id to a map of row keys. For each row key, it stores a + // boolean and an enum. The first bool is true when we observe a miss upon the + // first time we encounter the row key. The second arg is INSERTED when the + // kv-pair has been inserted into the cache, ADMITTED if it should be inserted + // but haven't been, NO_INSERT if it should not be inserted. + // + // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not + // know its size. This may happen if the first access on the referenced key is + // an index/filter block. + std::map>> + getid_getkeys_map_; + bool insert_blocks_upon_row_kvpair_miss_; }; // A block cache simulator that reports miss ratio curves given a set of cache diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc new file mode 100644 index 00000000000..fb0c9e84976 --- /dev/null +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "utilities/simulator_cache/cache_simulator.h" + +#include +#include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { +namespace { +const std::string kBlockKeyPrefix = "test-block-"; +const std::string kRefKeyPrefix = "test-get-"; +const uint64_t kGetId = 1; +const uint64_t kGetBlockId = 100; +const uint64_t kCompactionBlockId = 1000; +const uint64_t kCacheSize = 1024 * 1024 * 1024; +const uint64_t kGhostCacheSize = 1024 * 1024; +} // namespace + +class CacheSimulatorTest : public testing::Test { + public: + const size_t kNumBlocks = 5; + const size_t kValueSize = 1000; + + CacheSimulatorTest() { env_ = rocksdb::Env::Default(); } + + BlockCacheTraceRecord GenerateGetRecord(uint64_t getid) { + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = 4096; + record.block_key = kBlockKeyPrefix + std::to_string(kGetBlockId); + record.access_timestamp = env_->NowMicros(); + record.cf_id = 0; + record.cf_name = "test"; + record.caller = TableReaderCaller::kUserGet; + record.level = 6; + record.sst_fd_number = kGetBlockId; + record.get_id = getid; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + record.referenced_key = + kRefKeyPrefix + std::to_string(kGetId) + std::string(8, 'c'); + record.referenced_key_exist_in_block = Boolean::kTrue; + record.referenced_data_size = 100; + record.num_keys_in_block = 300; + return record; + } + + BlockCacheTraceRecord GenerateCompactionRecord() { + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceDataBlock; + record.block_size = 4096; + record.block_key = kBlockKeyPrefix + std::to_string(kCompactionBlockId); + record.access_timestamp = env_->NowMicros(); + record.cf_id = 0; + record.cf_name = "test"; + record.caller = TableReaderCaller::kCompaction; + record.level = 6; + record.sst_fd_number = kCompactionBlockId; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kTrue; + return record; + } + + Env* env_; +}; + +TEST_F(CacheSimulatorTest, GhostCache) { + const std::string key1 = "test1"; + const std::string key2 = "test2"; + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + EXPECT_FALSE(ghost_cache->Admit(key1)); + EXPECT_TRUE(ghost_cache->Admit(key1)); + EXPECT_TRUE(ghost_cache->Admit(key1)); + EXPECT_FALSE(ghost_cache->Admit(key2)); + EXPECT_TRUE(ghost_cache->Admit(key2)); +} + +TEST_F(CacheSimulatorTest, CacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + const BlockCacheTraceRecord& compaction_access = GenerateCompactionRecord(); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new CacheSimulator(nullptr, sim_cache)); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->user_accesses()); + ASSERT_EQ(50, cache_simulator->user_miss_ratio()); + + cache_simulator->Access(compaction_access); + cache_simulator->Access(compaction_access); + ASSERT_EQ(4, cache_simulator->total_accesses()); + ASSERT_EQ(75, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->user_accesses()); + ASSERT_EQ(50, 
cache_simulator->user_miss_ratio()); + + cache_simulator->reset_counter(); + ASSERT_EQ(0, cache_simulator->total_accesses()); + ASSERT_EQ(-1, cache_simulator->miss_ratio()); + auto handle = sim_cache->Lookup(access.block_key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + handle = sim_cache->Lookup(compaction_access.block_key); + ASSERT_EQ(nullptr, handle); +} + +TEST_F(CacheSimulatorTest, GhostCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + std::unique_ptr cache_simulator(new CacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + // Both of them will be miss since we have a ghost cache. + ASSERT_EQ(100, cache_simulator->miss_ratio()); +} + +TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new PrioritizedCacheSimulator(nullptr, sim_cache)); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio()); + + auto handle = sim_cache->Lookup(access.block_key); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); +} + +TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { + const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId); + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + std::unique_ptr cache_simulator( + new PrioritizedCacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + cache_simulator->Access(access); + cache_simulator->Access(access); + ASSERT_EQ(2, cache_simulator->total_accesses()); + // Both of them will be miss since we have a ghost cache. + ASSERT_EQ(100, cache_simulator->miss_ratio()); +} + +TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { + uint64_t block_id = 100; + BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1); + second_get.referenced_data_size = 0; + second_get.referenced_key_exist_in_block = Boolean::kFalse; + second_get.referenced_key = kRefKeyPrefix + std::to_string(kGetId); + BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2); + third_get.referenced_data_size = 0; + third_get.referenced_key_exist_in_block = Boolean::kFalse; + third_get.referenced_key = kRefKeyPrefix + "third_get"; + // We didn't find the referenced key in the third get. 
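+  // With referenced_data_size == 0, the simulator can admit but never
+  // actually insert this row key (it stays in InsertResult::ADMITTED).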
+  std::shared_ptr<Cache> sim_cache =
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0);
+  std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+      new HybridRowBlockCacheSimulator(
+          nullptr, sim_cache, /*insert_blocks_upon_row_kvpair_miss=*/true));
+  // The first get request accesses 10 blocks. We should only report 10
+  // accesses and a 100% miss ratio.
+  for (uint32_t i = 0; i < 10; i++) {
+    first_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(first_get);
+    block_id++;
+  }
+  ASSERT_EQ(10, cache_simulator->total_accesses());
+  ASSERT_EQ(100, cache_simulator->miss_ratio());
+  ASSERT_EQ(10, cache_simulator->user_accesses());
+  ASSERT_EQ(100, cache_simulator->user_miss_ratio());
+  auto handle = sim_cache->Lookup(
+      ExtractUserKey(std::to_string(first_get.sst_fd_number) + "_" +
+                     first_get.referenced_key));
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    ASSERT_NE(nullptr, handle);
+    sim_cache->Release(handle);
+  }
+
+  // The second get request accesses the same key. We should report 15
+  // accesses and a 66% miss ratio (10 misses out of 15 accesses).
+  // We do not consider these 5 block lookups as misses since the row hits the
+  // cache.
+  for (uint32_t i = 0; i < 5; i++) {
+    second_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(second_get);
+    block_id++;
+  }
+  ASSERT_EQ(15, cache_simulator->total_accesses());
+  ASSERT_EQ(66, static_cast<uint64_t>(cache_simulator->miss_ratio()));
+  ASSERT_EQ(15, cache_simulator->user_accesses());
+  ASSERT_EQ(66, static_cast<uint64_t>(cache_simulator->user_miss_ratio()));
+  handle = sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" +
+                             second_get.referenced_key);
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    if (i < 110) {
+      ASSERT_NE(nullptr, handle) << i;
+      sim_cache->Release(handle);
+    } else {
+      ASSERT_EQ(nullptr, handle) << i;
+    }
+  }
+
+  // The third get is on a different key and has no referenced data size.
+  // This key should not be inserted into the cache.
+  for (uint32_t i = 0; i < 5; i++) {
+    third_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(third_get);
+    block_id++;
+  }
+  ASSERT_EQ(20, cache_simulator->total_accesses());
+  ASSERT_EQ(75, static_cast<uint64_t>(cache_simulator->miss_ratio()));
+  ASSERT_EQ(20, cache_simulator->user_accesses());
+  ASSERT_EQ(75, static_cast<uint64_t>(cache_simulator->user_miss_ratio()));
+  // Assert that the third key is not inserted into the cache.
+ handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" + + third_get.referenced_key); + ASSERT_EQ(nullptr, handle); + for (uint32_t i = 100; i < block_id; i++) { + if (i < 110 || i >= 115) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_NE(nullptr, handle) << i; + sim_cache->Release(handle); + } else { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_EQ(nullptr, handle) << i; + } + } +} + +TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { + uint64_t block_id = 100; + BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/false)); + for (uint32_t i = 0; i < 9; i++) { + first_get.block_key = kBlockKeyPrefix + std::to_string(block_id); + cache_simulator->Access(first_get); + block_id++; + } + auto handle = + sim_cache->Lookup(ExtractUserKey(std::to_string(first_get.sst_fd_number) + + "_" + first_get.referenced_key)); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + // All blocks are missing from the cache since insert_blocks_row_kvpair_misses + // is set to false. + for (uint32_t i = 100; i < block_id; i++) { + handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i)); + ASSERT_EQ(nullptr, handle); + } +} + +TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) { + std::unique_ptr ghost_cache(new GhostCache( + NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0))); + const BlockCacheTraceRecord& first_get = GenerateGetRecord(kGetId); + const BlockCacheTraceRecord& second_get = GenerateGetRecord(kGetId + 1); + const BlockCacheTraceRecord& third_get = GenerateGetRecord(kGetId + 2); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + std::move(ghost_cache), + NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0), + /*insert_blocks_row_kvpair_misses=*/false)); + // Two get requests access the same key. + cache_simulator->Access(first_get); + cache_simulator->Access(second_get); + ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->user_accesses()); + ASSERT_EQ(100, cache_simulator->user_miss_ratio()); + // We insert the key-value pair upon the second get request. A third get + // request should observe a hit. + for (uint32_t i = 0; i < 10; i++) { + cache_simulator->Access(third_get); + } + ASSERT_EQ(12, cache_simulator->total_accesses()); + ASSERT_EQ(16, static_cast(cache_simulator->miss_ratio())); + ASSERT_EQ(12, cache_simulator->user_accesses()); + ASSERT_EQ(16, static_cast(cache_simulator->user_miss_ratio())); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 3e9c5a35237d0ae5d1d8b0499b4dd8844e0ec56d Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 12 Jul 2019 16:52:15 -0700 Subject: [PATCH 217/572] Block cache analyzer: Add more stats (#5516) Summary: This PR provides more command line options for block cache analyzer to better understand block cache access pattern. 
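For example, a few of the new options might be combined in a single run (a hypothetical invocation; required options such as the trace file path are omitted, and the binary name is assumed from the tool's source file):

    ./block_cache_trace_analyzer \
        -analyze_callers=Get,Iterator \
        -access_count_buckets=1,10,100,1000 \
        -analyze_top_k_access_count_blocks=10

The new options: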
-analyze_bottom_k_access_count_blocks -analyze_top_k_access_count_blocks -reuse_lifetime_labels -reuse_lifetime_buckets -analyze_callers -access_count_buckets -analyze_blocks_reuse_k_reuse_window Pull Request resolved: https://github.com/facebook/rocksdb/pull/5516 Test Plan: make clean && COMPILE_WITH_ASAN=1 make check -j32 Differential Revision: D16037440 Pulled By: HaoyuHuang fbshipit-source-id: b9a4ac0d4712053fab910732077a4d4b91400bc8 --- tools/block_cache_trace_analyzer.cc | 1239 +++++++++++++++++----- tools/block_cache_trace_analyzer.h | 134 ++- tools/block_cache_trace_analyzer_test.cc | 262 ++++- trace_replay/block_cache_tracer.cc | 2 + trace_replay/block_cache_tracer.h | 3 + 5 files changed, 1312 insertions(+), 328 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index bd8d8971bfc..76633846257 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -44,20 +44,14 @@ DEFINE_bool(print_data_block_access_count_stats, false, DEFINE_int32(cache_sim_warmup_seconds, 0, "The number of seconds to warmup simulated caches. The hit/miss " "counters are reset after the warmup completes."); -DEFINE_string( - block_cache_analysis_result_dir, "", - "The directory that saves block cache analysis results. It contains 1) a " - "mrc file that saves the computed miss ratios for simulated caches. Its " - "format is " - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses. 2) Several " - "\"label_access_timeline\" files that contain number of accesses per " - "second grouped by the label. File format: " - "time,label_1_access_per_second,label_2_access_per_second,...,label_N_" - "access_per_second where N is the number of unique labels found in the " - "trace. 3) Several \"label_reuse_distance\" and \"label_reuse_interval\" " - "csv files that contain the reuse distance/interval grouped by label. File " - "format: bucket,label_1,label_2,...,label_N. The first N buckets are " - "absolute values. The second N buckets are percentage values."); +DEFINE_int32(analyze_bottom_k_access_count_blocks, 0, + "Print out detailed access information for blocks with their " + "number of accesses are the bottom k among all blocks."); +DEFINE_int32(analyze_top_k_access_count_blocks, 0, + "Print out detailed access information for blocks with their " + "number of accesses are the top k among all blocks."); +DEFINE_string(block_cache_analysis_result_dir, "", + "The directory that saves block cache analysis results."); DEFINE_string( timeline_labels, "", "Group the number of accesses per block per second using these labels. " @@ -92,6 +86,42 @@ DEFINE_string( "seconds, between 10 seconds and 100 seconds, respectively. The last " "bucket contains the number of blocks with reuse interval longer than 100 " "seconds."); +DEFINE_string( + reuse_lifetime_labels, "", + "Group the reuse lifetime of a block using these labels. Reuse " + "lifetime is defined as the time interval between the first access on a " + "block and the last access on the same block. For blocks that are only " + "accessed once, its lifetime is set to kMaxUint64."); +DEFINE_string( + reuse_lifetime_buckets, "", + "Group blocks by their reuse lifetime given these buckets. For " + "example, if 'reuse_lifetime_buckets' is '1,10,100', we will " + "create four buckets. The first three buckets contain the number of " + "blocks with reuse lifetime less than 1 second, between 1 second and 10 " + "seconds, between 10 seconds and 100 seconds, respectively. 
The last "
+    "bucket contains the number of blocks with reuse lifetime longer than 100 "
+    "seconds.");
+DEFINE_string(
+    analyze_callers, "",
+    "The list of callers to perform a detailed analysis on. If specified, the "
+    "analyzer will output a detailed percentage of accesses for each caller "
+    "broken down by column family, level, and block type. The available "
+    "callers are: Get, MultiGet, Iterator, ApproximateSize, VerifyChecksum, "
+    "SSTDumpTool, ExternalSSTIngestion, Repair, Prefetch, Compaction, "
+    "CompactionRefill, Flush, SSTFileReader, Uncategorized.");
+DEFINE_string(access_count_buckets, "",
+              "Group number of blocks by their access count given these "
+              "buckets. If specified, the analyzer will output a detailed "
+              "analysis on the number of blocks grouped by their access count "
+              "broken down by block type and column family.");
+DEFINE_int32(analyze_blocks_reuse_k_reuse_window, 0,
+             "Analyze the percentage of blocks that are accessed in the "
+             "[k, 2*k] seconds and accessed again in the next [2*k, 3*k], "
+             "[3*k, 4*k],...,[k*(n-1), k*n] seconds. ");
+DEFINE_string(analyze_get_spatial_locality_labels, "",
+              "Group data blocks using these labels.");
+DEFINE_string(analyze_get_spatial_locality_buckets, "",
+              "Group data blocks by their statistics using these buckets.");
 
 namespace rocksdb {
 namespace {
@@ -112,6 +142,24 @@ const std::string kSupportedCacheNames =
     "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss "
     "ghost_lru_hybrid_no_insert_on_row_miss ";
 
+// The suffix for the generated csv files.
+const std::string kFileNameSuffixAccessTimeline = "access_timeline";
+const std::string kFileNameSuffixAvgReuseIntervalNaccesses =
+    "avg_reuse_interval_naccesses";
+const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval";
+const std::string kFileNameSuffixReuseInterval = "access_reuse_interval";
+const std::string kFileNameSuffixReuseLifetime = "reuse_lifetime";
+const std::string kFileNameSuffixAccessReuseBlocksTimeline =
+    "reuse_blocks_timeline";
+const std::string kFileNameSuffixPercentOfAccessSummary =
+    "percentage_of_accesses_summary";
+const std::string kFileNameSuffixPercentRefKeys = "percent_ref_keys";
+const std::string kFileNameSuffixPercentDataSizeOnRefKeys =
+    "percent_data_size_on_ref_keys";
+const std::string kFileNameSuffixPercentAccessesOnRefKeys =
+    "percent_accesses_on_ref_keys";
+const std::string kFileNameSuffixAccessCountSummary = "access_count_summary";
+
 std::string block_type_to_string(TraceType type) {
   switch (type) {
     case kBlockTraceFilterBlock:
@@ -168,6 +216,53 @@ std::string caller_to_string(TableReaderCaller caller) {
   return "InvalidCaller";
 }
 
+TableReaderCaller string_to_caller(std::string caller_str) {
+  if (caller_str == "Get") {
+    return kUserGet;
+  } else if (caller_str == "MultiGet") {
+    return kUserMultiGet;
+  } else if (caller_str == "Iterator") {
+    return kUserIterator;
+  } else if (caller_str == "ApproximateSize") {
+    return kUserApproximateSize;
+  } else if (caller_str == "VerifyChecksum") {
+    return kUserVerifyChecksum;
+  } else if (caller_str == "SSTDumpTool") {
+    return kSSTDumpTool;
+  } else if (caller_str == "ExternalSSTIngestion") {
+    return kExternalSSTIngestion;
+  } else if (caller_str == "Repair") {
+    return kRepair;
+  } else if (caller_str == "Prefetch") {
+    return kPrefetch;
+  } else if (caller_str == "Compaction") {
+    return kCompaction;
+  } else if (caller_str == "CompactionRefill") {
+    return kCompactionRefill;
+  } else if (caller_str == "Flush") {
+    return kFlush;
+  } else if (caller_str ==
"SSTFileReader") { + return kSSTFileReader; + } else if (caller_str == "Uncategorized") { + return kUncategorized; + } + return TableReaderCaller::kMaxBlockCacheLookupCaller; +} + +bool is_user_access(TableReaderCaller caller) { + switch (caller) { + case kUserGet: + case kUserMultiGet: + case kUserIterator: + case kUserApproximateSize: + case kUserVerifyChecksum: + return true; + default: + break; + } + return false; +} + const char kBreakLine[] = "***************************************************************\n"; @@ -248,7 +343,7 @@ std::set BlockCacheTraceAnalyzer::ParseLabelStr( std::string BlockCacheTraceAnalyzer::BuildLabel( const std::set& labels, const std::string& cf_name, uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller, - const std::string& block_key) const { + uint64_t block_key) const { std::map label_value_map; label_value_map[kGroupbyAll] = kGroupbyAll; label_value_map[kGroupbyLevel] = std::to_string(level); @@ -256,7 +351,7 @@ std::string BlockCacheTraceAnalyzer::BuildLabel( label_value_map[kGroupbySSTFile] = std::to_string(fd); label_value_map[kGroupbyBlockType] = block_type_to_string(type); label_value_map[kGroupbyColumnFamily] = cf_name; - label_value_map[kGroupbyBlock] = block_key; + label_value_map[kGroupbyBlock] = std::to_string(block_key); // Concatenate the label values. std::string label; for (auto const& l : labels) { @@ -269,12 +364,14 @@ std::string BlockCacheTraceAnalyzer::BuildLabel( return label; } -void BlockCacheTraceAnalyzer::WriteAccessTimeline( - const std::string& label_str) const { - std::set labels = ParseLabelStr(label_str); - uint64_t start_time = port::kMaxUint64; - uint64_t end_time = 0; - std::map> label_access_timeline; +void BlockCacheTraceAnalyzer::TraverseBlocks( + std::function + block_callback) const { + uint64_t block_id = 0; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. const std::string& cf_name = cf_aggregates.first; @@ -289,42 +386,161 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline( for (auto const& block_access_info : block_type_aggregates.second.block_access_info_map) { // Stats per block. 
- for (auto const& timeline : - block_access_info.second.caller_num_accesses_timeline) { - const TableReaderCaller caller = timeline.first; - const std::string& block_key = block_access_info.first; - const std::string label = - BuildLabel(labels, cf_name, fd, level, type, caller, block_key); - for (auto const& naccess : timeline.second) { - const uint64_t timestamp = naccess.first; - const uint64_t num = naccess.second; - label_access_timeline[label][timestamp] += num; - start_time = std::min(start_time, timestamp); - end_time = std::max(end_time, timestamp); - } - } + block_callback(cf_name, fd, level, type, block_access_info.first, + block_id, block_access_info.second); + block_id++; } } } } +} + +void BlockCacheTraceAnalyzer::WriteGetSpatialLocality( + const std::string& label_str, + const std::vector& percent_buckets) const { + std::set labels = ParseLabelStr(label_str); + std::map> label_pnrefkeys_nblocks; + std::map> label_pnrefs_nblocks; + std::map> label_pndatasize_nblocks; + uint64_t nblocks = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType /*block_type*/, + const std::string& /*block_key*/, + uint64_t /*block_key_id*/, + const BlockAccessInfo& block) { + if (block.num_keys == 0) { + return; + } + uint64_t naccesses = 0; + for (auto const& key_access : block.key_num_access_map) { + for (auto const& caller_access : key_access.second) { + if (caller_access.first == TableReaderCaller::kUserGet) { + naccesses += caller_access.second; + } + } + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock, + TableReaderCaller::kUserGet, /*block_id=*/0); + + const uint64_t percent_referenced_for_existing_keys = + static_cast(std::max( + percent(block.key_num_access_map.size(), block.num_keys), 0.0)); + const uint64_t percent_accesses_for_existing_keys = + static_cast(std::max( + percent(block.num_referenced_key_exist_in_block, naccesses), 0.0)); + const uint64_t percent_referenced_data_size = static_cast( + std::max(percent(block.referenced_data_size, block.block_size), 0.0)); + if (label_pnrefkeys_nblocks.find(label) == label_pnrefkeys_nblocks.end()) { + for (auto const& percent_bucket : percent_buckets) { + label_pnrefkeys_nblocks[label][percent_bucket] = 0; + label_pnrefs_nblocks[label][percent_bucket] = 0; + label_pndatasize_nblocks[label][percent_bucket] = 0; + } + } + label_pnrefkeys_nblocks[label] + .upper_bound(percent_referenced_for_existing_keys) + ->second += 1; + label_pnrefs_nblocks[label] + .upper_bound(percent_accesses_for_existing_keys) + ->second += 1; + label_pndatasize_nblocks[label] + .upper_bound(percent_referenced_data_size) + ->second += 1; + nblocks += 1; + }; + TraverseBlocks(block_callback); + WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys, + label_pnrefkeys_nblocks, nblocks); + WriteStatsToFile(label_str, percent_buckets, + kFileNameSuffixPercentAccessesOnRefKeys, + label_pnrefs_nblocks, nblocks); + WriteStatsToFile(label_str, percent_buckets, + kFileNameSuffixPercentDataSizeOnRefKeys, + label_pndatasize_nblocks, nblocks); +} + +void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str, + uint64_t time_unit, + bool user_access_only) const { + std::set labels = ParseLabelStr(label_str); + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + std::map> label_access_timeline; + std::map> access_count_block_id_map; + + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType 
type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + uint64_t naccesses = 0; + for (auto const& timeline : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + if (user_access_only && !is_user_access(caller)) { + continue; + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_id); + for (auto const& naccess : timeline.second) { + const uint64_t timestamp = naccess.first / time_unit; + const uint64_t num = naccess.second; + label_access_timeline[label][timestamp] += num; + start_time = std::min(start_time, timestamp); + end_time = std::max(end_time, timestamp); + naccesses += num; + } + } + if (naccesses > 0) { + access_count_block_id_map[naccesses].push_back(std::to_string(block_id)); + } + }; + TraverseBlocks(block_callback); // We have label_access_timeline now. Write them into a file. - const std::string output_path = - output_dir_ + "/" + label_str + "_access_timeline"; + const std::string user_access_prefix = + user_access_only ? "user_access_only_" : "all_access_"; + const std::string output_path = output_dir_ + "/" + user_access_prefix + + label_str + "_" + std::to_string(time_unit) + + "_" + kFileNameSuffixAccessTimeline; std::ofstream out(output_path); if (!out.is_open()) { return; } std::string header("time"); - for (auto const& label : label_access_timeline) { + if (labels.find("block") != labels.end()) { + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + // Write the most frequently accessed blocks first. + for (auto naccess_it = access_count_block_id_map.rbegin(); + naccess_it != access_count_block_id_map.rend(); naccess_it++) { + for (auto& block_id_it : naccess_it->second) { + std::string row(block_id_it); + for (uint64_t now = start_time; now <= end_time; now++) { + auto it = label_access_timeline[block_id_it].find(now); + row += ","; + if (it != label_access_timeline[block_id_it].end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + } + out.close(); + return; + } + for (uint64_t now = start_time; now <= end_time; now++) { header += ","; - header += label.first; + header += std::to_string(now); } out << header << std::endl; - std::string row; - for (uint64_t now = start_time; now <= end_time; now++) { - row = std::to_string(now); - for (auto const& label : label_access_timeline) { + for (auto const& label : label_access_timeline) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { auto it = label.second.find(now); row += ","; if (it != label.second.end()) { @@ -335,52 +551,38 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline( } out << row << std::endl; } + out.close(); } void BlockCacheTraceAnalyzer::WriteReuseDistance( const std::string& label_str, - const std::set& distance_buckets) const { + const std::vector& distance_buckets) const { std::set labels = ParseLabelStr(label_str); std::map> label_distance_num_reuses; uint64_t total_num_reuses = 0; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. 
- const uint64_t fd = file_aggregates.first; - const uint32_t level = file_aggregates.second.level; - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - const std::string& block_key = block_access_info.first; - const std::string label = BuildLabel( - labels, cf_name, fd, level, type, - TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); - if (label_distance_num_reuses.find(label) == - label_distance_num_reuses.end()) { - // The first time we encounter this label. - for (auto const& distance_bucket : distance_buckets) { - label_distance_num_reuses[label][distance_bucket] = 0; - } - } - for (auto const& reuse_distance : - block_access_info.second.reuse_distance_count) { - label_distance_num_reuses[label] - .upper_bound(reuse_distance.first) - ->second += reuse_distance.second; - total_num_reuses += reuse_distance.second; - } - } + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id); + if (label_distance_num_reuses.find(label) == + label_distance_num_reuses.end()) { + // The first time we encounter this label. + for (auto const& distance_bucket : distance_buckets) { + label_distance_num_reuses[label][distance_bucket] = 0; } } - } - + for (auto const& reuse_distance : block.reuse_distance_count) { + label_distance_num_reuses[label] + .upper_bound(reuse_distance.first) + ->second += reuse_distance.second; + total_num_reuses += reuse_distance.second; + } + }; + TraverseBlocks(block_callback); // We have label_naccesses and label_distance_num_reuses now. Write them into // a file. const std::string output_path = @@ -395,18 +597,6 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance( header += label_it.first; } out << header << std::endl; - // Absolute values. - for (auto const& bucket : distance_buckets) { - std::string row(std::to_string(bucket)); - for (auto const& label_it : label_distance_num_reuses) { - auto const& it = label_it.second.find(bucket); - assert(it != label_it.second.end()); - row += ","; - row += std::to_string(it->second); - } - out << row << std::endl; - } - // Percentage values. for (auto const& bucket : distance_buckets) { std::string row(std::to_string(bucket)); for (auto const& label_it : label_distance_num_reuses) { @@ -421,7 +611,7 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance( } void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats( - const std::string& label, const std::set& time_buckets, + const std::string& label, const std::vector& time_buckets, const std::map timeline, std::map>* label_time_num_reuses, uint64_t* total_num_reuses) const { @@ -434,119 +624,434 @@ void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats( } } auto it = timeline.begin(); - const uint64_t prev_timestamp = it->first; + uint64_t prev_timestamp = it->first; const uint64_t prev_num = it->second; it++; // Reused within one second. 
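+  // Accesses beyond the first within the same second land in the smallest
+  // bucket via upper_bound(0): prev_num accesses at one timestamp count as
+  // prev_num - 1 reuses.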
if (prev_num > 1) { - (*label_time_num_reuses)[label].upper_bound(1)->second += prev_num - 1; + (*label_time_num_reuses)[label].upper_bound(0)->second += prev_num - 1; *total_num_reuses += prev_num - 1; } while (it != timeline.end()) { const uint64_t timestamp = it->first; const uint64_t num = it->second; const uint64_t reuse_interval = timestamp - prev_timestamp; - (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += num; + (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += 1; + if (num > 1) { + (*label_time_num_reuses)[label].upper_bound(0)->second += num - 1; + } + prev_timestamp = timestamp; *total_num_reuses += num; + it++; + } +} + +void BlockCacheTraceAnalyzer::WriteStatsToFile( + const std::string& label_str, const std::vector& time_buckets, + const std::string& filename_suffix, + const std::map>& label_data, + uint64_t ntotal) const { + const std::string output_path = + output_dir_ + "/" + label_str + "_" + filename_suffix; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_data) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + for (auto const& bucket : time_buckets) { + std::string row(std::to_string(bucket)); + for (auto const& label_it : label_data) { + auto const& it = label_it.second.find(bucket); + assert(it != label_it.second.end()); + row += ","; + row += std::to_string(percent(it->second, ntotal)); + } + out << row << std::endl; } + out.close(); } void BlockCacheTraceAnalyzer::WriteReuseInterval( const std::string& label_str, - const std::set& time_buckets) const { + const std::vector& time_buckets) const { std::set labels = ParseLabelStr(label_str); std::map> label_time_num_reuses; + std::map> label_avg_reuse_nblocks; + std::map> label_avg_reuse_naccesses; + uint64_t total_num_reuses = 0; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - const uint64_t fd = file_aggregates.first; - const uint32_t level = file_aggregates.second.level; - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - const std::string& block_key = block_access_info.first; - if (labels.find(kGroupbyCaller) != labels.end()) { - for (auto const& timeline : - block_access_info.second.caller_num_accesses_timeline) { - const TableReaderCaller caller = timeline.first; - const std::string label = BuildLabel(labels, cf_name, fd, level, - type, caller, block_key); - UpdateReuseIntervalStats(label, time_buckets, timeline.second, - &label_time_num_reuses, - &total_num_reuses); - } - continue; - } - // Does not group by caller so we need to flatten the access timeline. 
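// The four-level nesting being deleted here (column family -> SST file ->
// block type -> block) used to be repeated in every analysis method; this
// patch factors it into TraverseBlocks(), so each analysis now supplies
// only a callback. A minimal usage sketch, assuming the signature declared
// in block_cache_trace_analyzer.h:
//
//   uint64_t nblocks = 0;
//   TraverseBlocks([&](const std::string& /*cf_name*/, uint64_t /*fd*/,
//                      uint32_t /*level*/, TraceType /*type*/,
//                      const std::string& /*block_key*/,
//                      uint64_t /*block_id*/,
//                      const BlockAccessInfo& /*block*/) { nblocks++; });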
- const std::string label = BuildLabel( - labels, cf_name, fd, level, type, - TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); - std::map timeline; - for (auto const& caller_timeline : - block_access_info.second.caller_num_accesses_timeline) { - for (auto const& time_naccess : caller_timeline.second) { - timeline[time_naccess.first] += time_naccess.second; - } - } - UpdateReuseIntervalStats(label, time_buckets, timeline, - &label_time_num_reuses, &total_num_reuses); - } + uint64_t total_nblocks = 0; + uint64_t total_accesses = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + total_nblocks++; + total_accesses += block.num_accesses; + uint64_t avg_reuse_interval = 0; + if (block.num_accesses > 1) { + avg_reuse_interval = ((block.last_access_time - block.first_access_time) / + kMicrosInSecond) / + block.num_accesses; + } else { + avg_reuse_interval = port::kMaxUint64 - 1; + } + if (labels.find(kGroupbyCaller) != labels.end()) { + for (auto const& timeline : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_id); + UpdateReuseIntervalStats(label, time_buckets, timeline.second, + &label_time_num_reuses, &total_num_reuses); + } + return; + } + // Does not group by caller so we need to flatten the access timeline. + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id); + std::map timeline; + for (auto const& caller_timeline : block.caller_num_accesses_timeline) { + for (auto const& time_naccess : caller_timeline.second) { + timeline[time_naccess.first] += time_naccess.second; + } + } + UpdateReuseIntervalStats(label, time_buckets, timeline, + &label_time_num_reuses, &total_num_reuses); + if (label_avg_reuse_nblocks.find(label) == label_avg_reuse_nblocks.end()) { + for (auto const& time_bucket : time_buckets) { + label_avg_reuse_nblocks[label][time_bucket] = 0; + label_avg_reuse_naccesses[label][time_bucket] = 0; + } + } + label_avg_reuse_nblocks[label].upper_bound(avg_reuse_interval)->second += 1; + label_avg_reuse_naccesses[label].upper_bound(avg_reuse_interval)->second += + block.num_accesses; + }; + TraverseBlocks(block_callback); + + // Write the stats into files. 
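// (A worked example of the classification above, assuming a block first
// seen at t=0s and last seen at t=100s with 4 accesses in total:
// avg_reuse_interval = 100 / 4 = 25 seconds, so the block is counted in
// the first bucket whose bound exceeds 25. A block accessed exactly once
// is assigned port::kMaxUint64 - 1 and therefore always falls into the
// catch-all port::kMaxUint64 bucket appended by parse_buckets().)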
+ WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval, + label_time_num_reuses, total_num_reuses); + WriteStatsToFile(label_str, time_buckets, kFileNameSuffixAvgReuseInterval, + label_avg_reuse_nblocks, total_nblocks); + WriteStatsToFile(label_str, time_buckets, + kFileNameSuffixAvgReuseIntervalNaccesses, + label_avg_reuse_naccesses, total_accesses); +} + +void BlockCacheTraceAnalyzer::WriteReuseLifetime( + const std::string& label_str, + const std::vector& time_buckets) const { + std::set labels = ParseLabelStr(label_str); + std::map> label_lifetime_nblocks; + uint64_t total_nblocks = 0; + auto block_callback = [&](const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + uint64_t lifetime = 0; + if (block.num_accesses > 1) { + lifetime = + (block.last_access_time - block.first_access_time) / kMicrosInSecond; + } else { + lifetime = port::kMaxUint64 - 1; + } + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_id); + + if (label_lifetime_nblocks.find(label) == label_lifetime_nblocks.end()) { + // The first time we encounter this label. + for (auto const& time_bucket : time_buckets) { + label_lifetime_nblocks[label][time_bucket] = 0; } } + label_lifetime_nblocks[label].upper_bound(lifetime)->second += 1; + total_nblocks += 1; + }; + TraverseBlocks(block_callback); + WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime, + label_lifetime_nblocks, total_nblocks); +} + +void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline( + uint64_t reuse_window, bool user_access_only, TraceType block_type) const { + // A map from block key to an array of bools that states whether a block is + // accessed in a time window. + std::map> block_accessed; + const uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + const uint64_t reuse_vector_size = (trace_duration / reuse_window); + if (reuse_vector_size < 2) { + // The reuse window is less than 2. We cannot calculate the reused + // percentage of blocks. + return; } + auto block_callback = [&](const std::string& /*cf_name*/, uint64_t /*fd*/, + uint32_t /*level*/, TraceType /*type*/, + const std::string& /*block_key*/, uint64_t block_id, + const BlockAccessInfo& block) { + if (block_accessed.find(block_id) == block_accessed.end()) { + block_accessed[block_id].resize(reuse_vector_size); + for (uint64_t i = 0; i < reuse_vector_size; i++) { + block_accessed[block_id][i] = false; + } + } + for (auto const& caller_num : block.caller_num_accesses_timeline) { + const TableReaderCaller caller = caller_num.first; + for (auto const& timeline : caller_num.second) { + const uint64_t timestamp = timeline.first; + const uint64_t elapsed_time = + timestamp - trace_start_timestamp_in_seconds_; + if (!user_access_only || (user_access_only && is_user_access(caller))) { + uint64_t index = + std::min(elapsed_time / reuse_window, reuse_vector_size - 1); + block_accessed[block_id][index] = true; + } + } + } + }; + TraverseBlocks(block_callback); - // We have label_naccesses and label_interval_num_reuses now. Write them into - // a file. + // A cell is the number of blocks accessed in a reuse window. + uint64_t reuse_table[reuse_vector_size][reuse_vector_size]; + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + // Initialize the reuse_table. 
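// (reuse_table[i][j], for j >= i, counts the blocks accessed in window i
// that are accessed again in window j; each row is printed below as a
// percentage of its diagonal entry reuse_table[i][i], i.e. of the blocks
// that were accessed in the starting window.)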
+ for (uint64_t i = 0; i < reuse_vector_size; i++) { + reuse_table[start_time][i] = 0; + } + // Examine all blocks. + for (auto const& block : block_accessed) { + for (uint64_t i = start_time; i < reuse_vector_size; i++) { + if (block.second[start_time] && block.second[i]) { + // This block is accessed at start time and at the current time. We + // increment reuse_table[start_time][i] since it is reused at the ith + // window. + reuse_table[start_time][i]++; + } + } + } + } + const std::string user_access_prefix = + user_access_only ? "_user_access_only_" : "_all_access_"; const std::string output_path = - output_dir_ + "/" + label_str + "_reuse_interval"; + output_dir_ + "/" + block_type_to_string(block_type) + + user_access_prefix + std::to_string(reuse_window) + "_" + + kFileNameSuffixAccessReuseBlocksTimeline; std::ofstream out(output_path); if (!out.is_open()) { return; } - std::string header("bucket"); - for (auto const& label_it : label_time_num_reuses) { + std::string header("start_time"); + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { header += ","; - header += label_it.first; + header += std::to_string(start_time); } out << header << std::endl; - // Absolute values. - for (auto const& bucket : time_buckets) { - std::string row(std::to_string(bucket)); - for (auto const& label_it : label_time_num_reuses) { - auto const& it = label_it.second.find(bucket); - assert(it != label_it.second.end()); + for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { + std::string row(std::to_string(start_time * reuse_window)); + for (uint64_t j = 0; j < reuse_vector_size; j++) { row += ","; - row += std::to_string(it->second); + if (j < start_time) { + row += "100.0"; + } else { + row += std::to_string(percent(reuse_table[start_time][j], + reuse_table[start_time][start_time])); + } } out << row << std::endl; } - // Percentage values. 
- for (auto const& bucket : time_buckets) { - std::string row(std::to_string(bucket)); - for (auto const& label_it : label_time_num_reuses) { - auto const& it = label_it.second.find(bucket); - assert(it != label_it.second.end()); - row += ","; - row += std::to_string(percent(it->second, total_num_reuses)); + out.close(); +} + +std::string BlockCacheTraceAnalyzer::OutputPercentAccessStats( + uint64_t total_accesses, + const std::map& cf_access_count) const { + std::string row; + for (auto const& cf_aggregates : cf_aggregates_map_) { + const std::string& cf_name = cf_aggregates.first; + const auto& naccess = cf_access_count.find(cf_name); + row += ","; + if (naccess != cf_access_count.end()) { + row += std::to_string(percent(naccess->second, total_accesses)); + } else { + row += "0"; } + } + return row; +} + +void BlockCacheTraceAnalyzer::WritePercentAccessSummaryStats() const { + std::map> + caller_cf_accesses; + uint64_t total_accesses = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType /*type*/, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + for (auto const& caller_num : block.caller_num_access_map) { + const TableReaderCaller caller = caller_num.first; + const uint64_t naccess = caller_num.second; + caller_cf_accesses[caller][cf_name] += naccess; + total_accesses += naccess; + } + }; + TraverseBlocks(block_callback); + + const std::string output_path = + output_dir_ + "/" + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("caller"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& cf_naccess_it : caller_cf_accesses) { + const TableReaderCaller caller = cf_naccess_it.first; + std::string row; + row += caller_to_string(caller); + row += OutputPercentAccessStats(total_accesses, cf_naccess_it.second); out << row << std::endl; } out.close(); } +void BlockCacheTraceAnalyzer::WriteDetailedPercentAccessSummaryStats( + TableReaderCaller analyzing_caller) const { + std::map> level_cf_accesses; + std::map> bt_cf_accesses; + uint64_t total_accesses = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t level, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + for (auto const& caller_num : block.caller_num_access_map) { + const TableReaderCaller caller = caller_num.first; + if (caller == analyzing_caller) { + const uint64_t naccess = caller_num.second; + level_cf_accesses[level][cf_name] += naccess; + bt_cf_accesses[type][cf_name] += naccess; + total_accesses += naccess; + } + } + }; + TraverseBlocks(block_callback); + { + const std::string output_path = + output_dir_ + "/" + caller_to_string(analyzing_caller) + "_level_" + + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("level"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& level_naccess_it : level_cf_accesses) { + const uint32_t level = level_naccess_it.first; + std::string row; + row += std::to_string(level); + row += OutputPercentAccessStats(total_accesses, level_naccess_it.second); + out << row << std::endl; + } + out.close(); + } + { + const std::string output_path = + output_dir_ + 
"/" + caller_to_string(analyzing_caller) + "_bt_" + + kFileNameSuffixPercentOfAccessSummary; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bt"); + for (auto const& cf_name : cf_aggregates_map_) { + header += ","; + header += cf_name.first; + } + out << header << std::endl; + for (auto const& bt_naccess_it : bt_cf_accesses) { + const TraceType bt = bt_naccess_it.first; + std::string row; + row += block_type_to_string(bt); + row += OutputPercentAccessStats(total_accesses, bt_naccess_it.second); + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats( + const std::vector& access_count_buckets, + bool user_access_only) const { + // x: buckets. + // y: # of accesses. + std::map> bt_access_nblocks; + std::map> cf_access_nblocks; + uint64_t total_nblocks = 0; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + const std::string type_str = block_type_to_string(type); + if (cf_access_nblocks.find(cf_name) == cf_access_nblocks.end()) { + // initialize. + for (auto& access : access_count_buckets) { + cf_access_nblocks[cf_name][access] = 0; + } + } + if (bt_access_nblocks.find(type_str) == bt_access_nblocks.end()) { + // initialize. + for (auto& access : access_count_buckets) { + bt_access_nblocks[type_str][access] = 0; + } + } + uint64_t naccesses = 0; + for (auto const& caller_access : block.caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + naccesses += caller_access.second; + } + } + if (naccesses == 0) { + return; + } + total_nblocks += 1; + bt_access_nblocks[type_str].upper_bound(naccesses)->second += 1; + cf_access_nblocks[cf_name].upper_bound(naccesses)->second += 1; + }; + TraverseBlocks(block_callback); + const std::string user_access_prefix = + user_access_only ? "user_access_only_" : "all_access_"; + WriteStatsToFile("cf", access_count_buckets, + user_access_prefix + kFileNameSuffixAccessCountSummary, + cf_access_nblocks, total_nblocks); + WriteStatsToFile("bt", access_count_buckets, + user_access_prefix + kFileNameSuffixAccessCountSummary, + bt_access_nblocks, total_nblocks); +} + BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( const std::string& trace_file_path, const std::string& output_dir, + bool compute_reuse_distance, std::unique_ptr&& cache_simulator) : env_(rocksdb::Env::Default()), trace_file_path_(trace_file_path), output_dir_(output_dir), + compute_reuse_distance_(compute_reuse_distance), cache_simulator_(std::move(cache_simulator)) {} void BlockCacheTraceAnalyzer::ComputeReuseDistance( @@ -577,19 +1082,28 @@ void BlockCacheTraceAnalyzer::RecordAccess( file_aggr.block_type_aggregates_map[access.block_type]; BlockAccessInfo& block_access_info = block_type_aggr.block_access_info_map[access.block_key]; - ComputeReuseDistance(&block_access_info); + if (compute_reuse_distance_) { + ComputeReuseDistance(&block_access_info); + } block_access_info.AddAccess(access); block_info_map_[access.block_key] = &block_access_info; + if (trace_start_timestamp_in_seconds_ == 0) { + trace_start_timestamp_in_seconds_ = + access.access_timestamp / kMicrosInSecond; + } + trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; - // Add this block to all existing blocks. 
- for (auto& cf_aggregates : cf_aggregates_map_) { - for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - for (auto& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - for (auto& existing_block : - block_type_aggregates.second.block_access_info_map) { - existing_block.second.unique_blocks_since_last_access.insert( - access.block_key); + if (compute_reuse_distance_) { + // Add this block to all existing blocks. + for (auto& cf_aggregates : cf_aggregates_map_) { + for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + for (auto& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + for (auto& existing_block : + block_type_aggregates.second.block_access_info_map) { + existing_block.second.unique_blocks_since_last_access.insert( + access.block_key); + } } } } @@ -608,6 +1122,9 @@ Status BlockCacheTraceAnalyzer::Analyze() { if (!s.ok()) { return s; } + uint64_t start = env_->NowMicros(); + uint64_t processed_records = 0; + uint64_t time_interval = 0; while (s.ok()) { BlockCacheTraceRecord access; s = reader.ReadAccess(&access); @@ -618,6 +1135,17 @@ Status BlockCacheTraceAnalyzer::Analyze() { if (cache_simulator_) { cache_simulator_->Access(access); } + processed_records++; + uint64_t now = env_->NowMicros(); + uint64_t duration = (now - start) / kMicrosInSecond; + if (duration > 10 * time_interval) { + fprintf(stdout, + "Running for %" PRIu64 " seconds: Processed %" PRIu64 + " records/second\n", + duration, processed_records / duration); + processed_records = 0; + time_interval++; + } } return Status::OK(); } @@ -626,26 +1154,21 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { HistogramStat bs_stats; std::map bt_stats_map; std::map> cf_bt_stats_map; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - bs_stats.Add(block_access_info.second.block_size); - bt_stats_map[type].Add(block_access_info.second.block_size); - cf_bt_stats_map[cf_name][type].Add( - block_access_info.second.block_size); + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType type, const std::string& /*block_key*/, + uint64_t /*block_id*/, const BlockAccessInfo& block) { + if (block.block_size == 0) { + // Block size may be 0 when 1) compaction observes a cache miss and + // does not insert the missing block into the cache again. 2) + // fetching filter blocks in SST files at the last level. 
+ return; } - } - } - } + bs_stats.Add(block.block_size); + bt_stats_map[type].Add(block.block_size); + cf_bt_stats_map[cf_name][type].Add(block.block_size); + }; + TraverseBlocks(block_callback); fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str()); for (auto const& bt_stats : bt_stats_map) { print_break_lines(/*num_break_lines=*/1); @@ -665,33 +1188,151 @@ void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { } } -void BlockCacheTraceAnalyzer::PrintAccessCountStats() const { +void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, + uint32_t bottom_k, + uint32_t top_k) const { HistogramStat access_stats; std::map bt_stats_map; std::map> cf_bt_stats_map; - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - const TraceType type = block_type_aggregates.first; - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - access_stats.Add(block_access_info.second.num_accesses); - bt_stats_map[type].Add(block_access_info.second.num_accesses); - cf_bt_stats_map[cf_name][type].Add( - block_access_info.second.num_accesses); - } + std::map> access_count_blocks; + auto block_callback = [&](const std::string& cf_name, uint64_t /*fd*/, + uint32_t /*level*/, TraceType type, + const std::string& block_key, uint64_t /*block_id*/, + const BlockAccessInfo& block) { + uint64_t naccesses = 0; + for (auto const& caller_access : block.caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + naccesses += caller_access.second; } } - } + if (naccesses == 0) { + return; + } + if (type == TraceType::kBlockTraceDataBlock) { + access_count_blocks[naccesses].push_back(block_key); + } + access_stats.Add(naccesses); + bt_stats_map[type].Add(naccesses); + cf_bt_stats_map[cf_name][type].Add(naccesses); + }; + TraverseBlocks(block_callback); fprintf(stdout, - "Block access count stats: The number of accesses per block.\n%s", + "Block access count stats: The number of accesses per block. %s\n%s", + user_access_only ? "User accesses only" : "All accesses", access_stats.ToString().c_str()); + uint32_t bottom_k_index = 0; + for (auto naccess_it = access_count_blocks.begin(); + naccess_it != access_count_blocks.end(); naccess_it++) { + bottom_k_index++; + if (bottom_k_index >= bottom_k) { + break; + } + std::map caller_naccesses; + uint64_t naccesses = 0; + for (auto const& block_id : naccess_it->second) { + BlockAccessInfo* block = block_info_map_.find(block_id)->second; + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + caller_naccesses[caller_access.first] += caller_access.second; + naccesses += caller_access.second; + } + } + } + std::string statistics("Caller:"); + for (auto const& caller_naccessess_it : caller_naccesses) { + statistics += caller_to_string(caller_naccessess_it.first); + statistics += ":"; + statistics += + std::to_string(percent(caller_naccessess_it.second, naccesses)); + statistics += ","; + } + fprintf(stdout, + "Bottom %" PRIu32 " access count. 
Access count=%" PRIu64 + " nblocks=%" PRIu64 " %s\n", + bottom_k, naccess_it->first, naccess_it->second.size(), + statistics.c_str()); + } + + uint32_t top_k_index = 0; + for (auto naccess_it = access_count_blocks.rbegin(); + naccess_it != access_count_blocks.rend(); naccess_it++) { + top_k_index++; + if (top_k_index >= top_k) { + break; + } + for (auto const& block_id : naccess_it->second) { + BlockAccessInfo* block = block_info_map_.find(block_id)->second; + std::string statistics("Caller:"); + uint64_t naccesses = 0; + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + naccesses += caller_access.second; + } + } + assert(naccesses > 0); + for (auto const& caller_access : block->caller_num_access_map) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + statistics += ","; + statistics += caller_to_string(caller_access.first); + statistics += ":"; + statistics += + std::to_string(percent(caller_access.second, naccesses)); + } + } + uint64_t ref_keys_accesses = 0; + uint64_t ref_keys_does_not_exist_accesses = 0; + for (auto const& ref_key_caller_access : block->key_num_access_map) { + for (auto const& caller_access : ref_key_caller_access.second) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + ref_keys_accesses += caller_access.second; + } + } + } + for (auto const& ref_key_caller_access : + block->non_exist_key_num_access_map) { + for (auto const& caller_access : ref_key_caller_access.second) { + if (!user_access_only || + (user_access_only && is_user_access(caller_access.first))) { + ref_keys_does_not_exist_accesses += caller_access.second; + } + } + } + statistics += ",nkeys="; + statistics += std::to_string(block->num_keys); + statistics += ",block_size="; + statistics += std::to_string(block->block_size); + statistics += ",num_ref_keys="; + statistics += std::to_string(block->key_num_access_map.size()); + statistics += ",percent_access_ref_keys="; + statistics += std::to_string(percent(ref_keys_accesses, naccesses)); + statistics += ",num_ref_keys_does_not_exist="; + statistics += std::to_string(block->non_exist_key_num_access_map.size()); + statistics += ",percent_access_ref_keys_does_not_exist="; + statistics += + std::to_string(percent(ref_keys_does_not_exist_accesses, naccesses)); + statistics += ",ref_data_size="; + statistics += std::to_string(block->referenced_data_size); + fprintf(stdout, + "Top %" PRIu32 " access count blocks access_count=%" PRIu64 + " %s\n", + top_k, naccess_it->first, statistics.c_str()); + // if (block->referenced_data_size > block->block_size) { + // for (auto const& ref_key_it : block->key_num_access_map) { + // ParsedInternalKey internal_key; + // ParseInternalKey(ref_key_it.first, &internal_key); + // printf("######%lu %lu %d %s\n", block->referenced_data_size, + // block->block_size, internal_key.type, + // internal_key.user_key.ToString().c_str()); + // } + // } + } + } + for (auto const& bt_stats : bt_stats_map) { print_break_lines(/*num_break_lines=*/1); fprintf(stdout, "Break down by block type %s: \n%s", @@ -727,62 +1368,49 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { HistogramStat stdev_naccesses_per_key_in_a_data_block; std::map cf_stdev_naccesses_per_key_in_a_data_block; + auto block_callback = + [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/, + TraceType /*type*/, const std::string& /*block_key*/, + uint64_t 
/*block_id*/, const BlockAccessInfo& block) { + if (block.num_keys == 0) { + return; + } + // Use four decimal points. + uint64_t percent_referenced_for_existing_keys = (uint64_t)( + ((double)block.key_num_access_map.size() / (double)block.num_keys) * + 10000.0); + uint64_t percent_referenced_for_non_existing_keys = + (uint64_t)(((double)block.non_exist_key_num_access_map.size() / + (double)block.num_keys) * + 10000.0); + uint64_t percent_accesses_for_existing_keys = + (uint64_t)(((double)block.num_referenced_key_exist_in_block / + (double)block.num_accesses) * + 10000.0); - for (auto const& cf_aggregates : cf_aggregates_map_) { - // Stats per column family. - const std::string& cf_name = cf_aggregates.first; - for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { - // Stats per SST file. - for (auto const& block_type_aggregates : - file_aggregates.second.block_type_aggregates_map) { - // Stats per block type. - for (auto const& block_access_info : - block_type_aggregates.second.block_access_info_map) { - // Stats per block. - if (block_access_info.second.num_keys == 0) { - continue; + HistogramStat hist_naccess_per_key; + for (auto const& key_access : block.key_num_access_map) { + for (auto const& caller_access : key_access.second) { + hist_naccess_per_key.Add(caller_access.second); } - // Use four decimal points. - uint64_t percent_referenced_for_existing_keys = (uint64_t)( - ((double)block_access_info.second.key_num_access_map.size() / - (double)block_access_info.second.num_keys) * - 10000.0); - uint64_t percent_referenced_for_non_existing_keys = - (uint64_t)(((double)block_access_info.second - .non_exist_key_num_access_map.size() / - (double)block_access_info.second.num_keys) * - 10000.0); - uint64_t percent_accesses_for_existing_keys = (uint64_t)( - ((double) - block_access_info.second.num_referenced_key_exist_in_block / - (double)block_access_info.second.num_accesses) * - 10000.0); - - HistogramStat hist_naccess_per_key; - for (auto const& key_access : - block_access_info.second.key_num_access_map) { - hist_naccess_per_key.Add(key_access.second); - } - uint64_t avg_accesses = hist_naccess_per_key.Average(); - uint64_t stdev_accesses = hist_naccess_per_key.StandardDeviation(); - avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); - cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); - stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); - cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add( - stdev_accesses); - - existing_keys_stats.Add(percent_referenced_for_existing_keys); - cf_existing_keys_stats_map[cf_name].Add( - percent_referenced_for_existing_keys); - non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys); - cf_non_existing_keys_stats_map[cf_name].Add( - percent_referenced_for_non_existing_keys); - block_access_stats.Add(percent_accesses_for_existing_keys); - cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys); } - } - } - } + uint64_t avg_accesses = hist_naccess_per_key.Average(); + uint64_t stdev_accesses = hist_naccess_per_key.StandardDeviation(); + avg_naccesses_per_key_in_a_data_block.Add(avg_accesses); + cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses); + stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses); + cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add(stdev_accesses); + + existing_keys_stats.Add(percent_referenced_for_existing_keys); + cf_existing_keys_stats_map[cf_name].Add( + percent_referenced_for_existing_keys); + 
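// (A worked example of the fixed-point encoding above: if 3 of the 40 keys
// in a block are ever referenced, 3.0 / 40.0 * 10000.0 stores 750, i.e.
// 7.5% expressed in hundredths of a percent, so the histograms keep two
// decimal digits of precision while storing plain integers.)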
non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys); + cf_non_existing_keys_stats_map[cf_name].Add( + percent_referenced_for_non_existing_keys); + block_access_stats.Add(percent_accesses_for_existing_keys); + cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys); + }; + TraverseBlocks(block_callback); fprintf(stdout, "Histogram on the number of referenced keys existing in a block over " "the total number of keys in a block: \n%s", @@ -1032,15 +1660,15 @@ std::vector parse_cache_config_file( return configs; } -std::set parse_buckets(const std::string& bucket_str) { - std::set buckets; +std::vector parse_buckets(const std::string& bucket_str) { + std::vector buckets; std::stringstream ss(bucket_str); while (ss.good()) { std::string bucket; getline(ss, bucket, ','); - buckets.insert(ParseUint64(bucket)); + buckets.push_back(ParseUint64(bucket)); } - buckets.insert(port::kMaxUint64); + buckets.push_back(port::kMaxUint64); return buckets; } @@ -1068,20 +1696,27 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { exit(1); } } - BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, - FLAGS_block_cache_analysis_result_dir, - std::move(cache_simulator)); + BlockCacheTraceAnalyzer analyzer( + FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, + !FLAGS_reuse_distance_labels.empty(), std::move(cache_simulator)); Status s = analyzer.Analyze(); if (!s.IsIncomplete()) { // Read all traces. fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); exit(1); } + fprintf(stdout, "Status: %s\n", s.ToString().c_str()); analyzer.PrintStatsSummary(); if (FLAGS_print_access_count_stats) { print_break_lines(/*num_break_lines=*/3); - analyzer.PrintAccessCountStats(); + analyzer.PrintAccessCountStats( + /*user_access_only=*/false, FLAGS_analyze_bottom_k_access_count_blocks, + FLAGS_analyze_top_k_access_count_blocks); + print_break_lines(/*num_break_lines=*/3); + analyzer.PrintAccessCountStats( + /*user_access_only=*/true, FLAGS_analyze_bottom_k_access_count_blocks, + FLAGS_analyze_top_k_access_count_blocks); } if (FLAGS_print_block_size_stats) { print_break_lines(/*num_break_lines=*/3); @@ -1099,13 +1734,36 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { while (ss.good()) { std::string label; getline(ss, label, ','); - analyzer.WriteAccessTimeline(label); + if (label.find("block") != std::string::npos) { + analyzer.WriteAccessTimeline(label, kSecondInMinute, true); + analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + analyzer.WriteAccessTimeline(label, kSecondInHour, true); + analyzer.WriteAccessTimeline(label, kSecondInHour, false); + } else { + analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + } } } + if (!FLAGS_analyze_callers.empty()) { + analyzer.WritePercentAccessSummaryStats(); + std::stringstream ss(FLAGS_analyze_callers); + while (ss.good()) { + std::string caller; + getline(ss, caller, ','); + analyzer.WriteDetailedPercentAccessSummaryStats(string_to_caller(caller)); + } + } + + if (!FLAGS_access_count_buckets.empty()) { + std::vector buckets = parse_buckets(FLAGS_access_count_buckets); + analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/true); + analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/false); + } + if (!FLAGS_reuse_distance_labels.empty() && !FLAGS_reuse_distance_buckets.empty()) { - std::set buckets = parse_buckets(FLAGS_reuse_distance_buckets); + std::vector buckets = parse_buckets(FLAGS_reuse_distance_buckets); std::stringstream 
ss(FLAGS_reuse_distance_labels); while (ss.good()) { std::string label; @@ -1116,7 +1774,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { if (!FLAGS_reuse_interval_labels.empty() && !FLAGS_reuse_interval_buckets.empty()) { - std::set buckets = parse_buckets(FLAGS_reuse_interval_buckets); + std::vector buckets = parse_buckets(FLAGS_reuse_interval_buckets); std::stringstream ss(FLAGS_reuse_interval_labels); while (ss.good()) { std::string label; @@ -1124,6 +1782,43 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.WriteReuseInterval(label, buckets); } } + + if (!FLAGS_reuse_lifetime_labels.empty() && + !FLAGS_reuse_lifetime_buckets.empty()) { + std::vector buckets = parse_buckets(FLAGS_reuse_lifetime_buckets); + std::stringstream ss(FLAGS_reuse_lifetime_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseLifetime(label, buckets); + } + } + + if (FLAGS_analyze_blocks_reuse_k_reuse_window != 0) { + std::vector block_types{TraceType::kBlockTraceIndexBlock, + TraceType::kBlockTraceDataBlock, + TraceType::kBlockTraceFilterBlock}; + for (auto block_type : block_types) { + analyzer.WriteBlockReuseTimeline( + FLAGS_analyze_blocks_reuse_k_reuse_window, + /*user_access_only=*/true, block_type); + analyzer.WriteBlockReuseTimeline( + FLAGS_analyze_blocks_reuse_k_reuse_window, + /*user_access_only=*/false, block_type); + } + } + + if (!FLAGS_analyze_get_spatial_locality_labels.empty() && + !FLAGS_analyze_get_spatial_locality_buckets.empty()) { + std::vector buckets = + parse_buckets(FLAGS_analyze_get_spatial_locality_buckets); + std::stringstream ss(FLAGS_analyze_get_spatial_locality_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteGetSpatialLocality(label, buckets); + } + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 617b90280c9..feb7c21f22c 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -9,13 +9,13 @@ #include #include +#include "db/dbformat.h" #include "rocksdb/env.h" #include "rocksdb/utilities/sim_cache.h" #include "trace_replay/block_cache_tracer.h" #include "utilities/simulator_cache/cache_simulator.h" namespace rocksdb { - // Statistics of a block. struct BlockAccessInfo { uint64_t num_accesses = 0; @@ -23,11 +23,12 @@ struct BlockAccessInfo { uint64_t first_access_time = 0; uint64_t last_access_time = 0; uint64_t num_keys = 0; - std::map + std::map> key_num_access_map; // for keys exist in this block. - std::map + std::map> non_exist_key_num_access_map; // for keys do not exist in this block. uint64_t num_referenced_key_exist_in_block = 0; + uint64_t referenced_data_size = 0; std::map caller_num_access_map; // caller:timestamp:number_of_accesses. The granularity of the timestamp is // seconds. 
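// A minimal illustration of the nested layout, with synthetic values:
//   caller_num_accesses_timeline[TableReaderCaller::kUserGet][1561748352]
//       = 3;
// records that user Gets hit this block three times during that second of
// the trace.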
@@ -39,6 +40,12 @@ struct BlockAccessInfo {
   std::map<uint64_t, uint64_t> reuse_distance_count;
 
   void AddAccess(const BlockCacheTraceRecord& access) {
+    if (block_size != 0 && access.block_size != 0) {
+      assert(block_size == access.block_size);
+    }
+    if (num_keys != 0 && access.num_keys_in_block != 0) {
+      assert(num_keys == access.num_keys_in_block);
+    }
     if (first_access_time == 0) {
       first_access_time = access.access_timestamp;
     }
@@ -54,10 +61,18 @@
         access.caller)) {
       num_keys = access.num_keys_in_block;
       if (access.referenced_key_exist_in_block == Boolean::kTrue) {
-        key_num_access_map[access.referenced_key]++;
+        if (key_num_access_map.find(access.referenced_key) ==
+            key_num_access_map.end()) {
+          referenced_data_size += access.referenced_data_size;
+        }
+        key_num_access_map[access.referenced_key][access.caller]++;
         num_referenced_key_exist_in_block++;
+        if (referenced_data_size > block_size && block_size != 0) {
+          ParsedInternalKey internal_key;
+          ParseInternalKey(access.referenced_key, &internal_key);
+        }
       } else {
-        non_exist_key_num_access_map[access.referenced_key]++;
+        non_exist_key_num_access_map[access.referenced_key][access.caller]++;
       }
     }
   }
@@ -83,6 +98,7 @@ class BlockCacheTraceAnalyzer {
  public:
   BlockCacheTraceAnalyzer(
       const std::string& trace_file_path, const std::string& output_dir,
+      bool compute_reuse_distance,
       std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
   ~BlockCacheTraceAnalyzer() = default;
   // No copy and move.
@@ -122,7 +138,8 @@
 
   // Print access count distribution and the distribution break down by block
   // type and column family.
-  void PrintAccessCountStats() const;
+  void PrintAccessCountStats(bool user_access_only, uint32_t bottom_k,
+                             uint32_t top_k) const;
 
   // Print data block accesses by user Get and Multi-Get.
   // It prints out 1) A histogram on the percentage of keys accessed in a data
@@ -131,24 +148,93 @@
   // accesses on keys that exist in a data block and its breakdown by column
   // family.
   void PrintDataBlockAccessStats() const;
 
+  // Write the percentage of accesses broken down by column family into a csv
+  // file saved in 'output_dir'.
+  //
+  // The file is named "percentage_of_accesses_summary". The file format is
+  // caller,cf_0,cf_1,...,cf_n where cf_i is a column family name found in
+  // the trace.
+  void WritePercentAccessSummaryStats() const;
+
+  // Write the percentage of accesses for the given caller broken down by
+  // column family, level, and block type into a csv file saved in
+  // 'output_dir'.
+  //
+  // It generates two files: 1) caller_level_percentage_of_accesses_summary and
+  // 2) caller_bt_percentage_of_accesses_summary which break down by level
+  // and block type, respectively. The file format is
+  // level/bt,cf_0,cf_1,...,cf_n where cf_i is a column family name found in
+  // the trace.
+  void WriteDetailedPercentAccessSummaryStats(TableReaderCaller caller) const;
+
+  // Write the access count summary into a csv file saved in 'output_dir'.
+  // It groups blocks by their access count.
+  //
+  // It generates two files: 1) cf_access_count_summary and 2)
+  // bt_access_count_summary which break down the access count by column family
+  // and block type, respectively. The file format is
+  // cf/bt,bucket_0,bucket_1,...,bucket_N.
+  void WriteAccessCountSummaryStats(
+      const std::vector<uint64_t>& access_count_buckets,
+      bool user_access_only) const;
+
   // Write miss ratio curves of simulated cache configurations into a csv file
-  // saved in 'output_dir'.
+  // named "mrc" saved in 'output_dir'.
+  //
+  // The file format is
+  // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses".
   void WriteMissRatioCurves() const;
 
   // Write the access timeline into a csv file saved in 'output_dir'.
-  void WriteAccessTimeline(const std::string& label) const;
+  //
+  // The file is named "label_access_timeline". The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique labels found in the trace.
+  void WriteAccessTimeline(const std::string& label, uint64_t time_unit,
+                           bool user_access_only) const;
 
   // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
   // distance is defined as the cumulative size of unique blocks read between
   // two consecutive accesses on the same block.
+  //
+  // The file is named "label_reuse_distance". The file format is
+  // bucket,label_1,label_2,...,label_N.
   void WriteReuseDistance(const std::string& label_str,
-                          const std::set<uint64_t>& distance_buckets) const;
+                          const std::vector<uint64_t>& distance_buckets) const;
 
   // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
   // interval is defined as the time between two consecutive accesses on the
-  // same block..
+  // same block.
+  //
+  // The file is named "label_reuse_interval". The file format is
+  // bucket,label_1,label_2,...,label_N.
   void WriteReuseInterval(const std::string& label_str,
-                          const std::set<uint64_t>& time_buckets) const;
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse lifetime into a csv file saved in 'output_dir'. Reuse
+  // lifetime is defined as the time interval between the first access of a
+  // block and its last access.
+  //
+  // The file is named "label_reuse_lifetime". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseLifetime(const std::string& label_str,
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse timeline into a csv file saved in 'output_dir'.
+  //
+  // The file is named
+  // "block_type_user_access_only_reuse_window_reuse_timeline". The file format
+  // is start_time,0,1,...,N where N equals trace_duration / reuse_window.
+  void WriteBlockReuseTimeline(uint64_t reuse_window, bool user_access_only,
+                               TraceType block_type) const;
+
+  // Write the Get spatial locality into csv files saved in 'output_dir'.
+  //
+  // It generates three csv files: label_percent_ref_keys,
+  // label_percent_accesses_on_ref_keys, and
+  // label_percent_data_size_on_ref_keys.
+ void WriteGetSpatialLocality( + const std::string& label_str, + const std::vector& percent_buckets) const; const std::map& TEST_cf_aggregates_map() const { @@ -161,28 +247,48 @@ class BlockCacheTraceAnalyzer { std::string BuildLabel(const std::set& labels, const std::string& cf_name, uint64_t fd, uint32_t level, TraceType type, - TableReaderCaller caller, - const std::string& block_key) const; + TableReaderCaller caller, uint64_t block_key) const; void ComputeReuseDistance(BlockAccessInfo* info) const; void RecordAccess(const BlockCacheTraceRecord& access); void UpdateReuseIntervalStats( - const std::string& label, const std::set& time_buckets, + const std::string& label, const std::vector& time_buckets, const std::map timeline, std::map>* label_time_num_reuses, uint64_t* total_num_reuses) const; + std::string OutputPercentAccessStats( + uint64_t total_accesses, + const std::map& cf_access_count) const; + + void WriteStatsToFile( + const std::string& label_str, const std::vector& time_buckets, + const std::string& filename_suffix, + const std::map>& label_data, + uint64_t ntotal) const; + + void TraverseBlocks( + std::function + block_callback) const; + rocksdb::Env* env_; const std::string trace_file_path_; const std::string output_dir_; + const bool compute_reuse_distance_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; std::map block_info_map_; + uint64_t trace_start_timestamp_in_seconds_ = 0; + uint64_t trace_end_timestamp_in_seconds_ = 0; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index efb202cb4ab..45ef99eee75 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -56,6 +56,12 @@ class BlockCacheTracerTest : public testing::Test { reuse_distance_buckets_ = "1,1K,1M,1G"; reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; reuse_interval_buckets_ = "1,10,100,1000"; + reuse_lifetime_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_lifetime_buckets_ = "1,10,100,1000"; + analyzing_callers_ = "Get,Iterator"; + access_count_buckets_ = "2,3,4,5,10"; + analyze_get_spatial_locality_labels_ = "all"; + analyze_get_spatial_locality_buckets_ = "10,20,30,40,50,60,70,80,90,100"; } ~BlockCacheTracerTest() override { @@ -158,12 +164,22 @@ class BlockCacheTracerTest : public testing::Test { "-print_access_count_stats", "-print_data_block_access_count_stats", "-cache_sim_warmup_seconds=0", + "-analyze_bottom_k_access_count_blocks=5", + "-analyze_top_k_access_count_blocks=5", + "-analyze_blocks_reuse_k_reuse_window=5", "-timeline_labels=" + timeline_labels_, "-reuse_distance_labels=" + reuse_distance_labels_, "-reuse_distance_buckets=" + reuse_distance_buckets_, "-reuse_interval_labels=" + reuse_interval_labels_, "-reuse_interval_buckets=" + reuse_interval_buckets_, - }; + "-reuse_lifetime_labels=" + reuse_lifetime_labels_, + "-reuse_lifetime_buckets=" + reuse_lifetime_buckets_, + "-analyze_callers=" + analyzing_callers_, + "-access_count_buckets=" + access_count_buckets_, + "-analyze_get_spatial_locality_labels=" + + analyze_get_spatial_locality_labels_, + "-analyze_get_spatial_locality_buckets=" + + analyze_get_spatial_locality_buckets_}; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -189,6 +205,12 @@ class BlockCacheTracerTest : public testing::Test { std::string reuse_distance_buckets_; std::string 
reuse_interval_labels_; std::string reuse_interval_buckets_; + std::string reuse_lifetime_labels_; + std::string reuse_lifetime_buckets_; + std::string analyzing_callers_; + std::string access_count_buckets_; + std::string analyze_get_spatial_locality_labels_; + std::string analyze_get_spatial_locality_buckets_; }; TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { @@ -247,51 +269,65 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } { // Validate the timeline csv files. - const uint32_t expected_num_lines = 50; - std::stringstream ss(timeline_labels_); - while (ss.good()) { - std::string l; - ASSERT_TRUE(getline(ss, l, ',')); - const std::string timeline_file = - test_path_ + "/" + l + "_access_timeline"; - std::ifstream infile(timeline_file); - std::string line; - uint32_t nlines = 0; - ASSERT_TRUE(getline(infile, line)); - uint64_t expected_time = 1; - while (getline(infile, line)) { - std::stringstream ss_naccess(line); - uint32_t naccesses = 0; - std::string substr; - uint32_t time = 0; - while (ss_naccess.good()) { - ASSERT_TRUE(getline(ss_naccess, substr, ',')); - if (time == 0) { - time = ParseUint32(substr); - continue; + const std::vector time_units{"_60", "_3600"}; + const std::vector user_access_only_flags{"user_access_only_", + "all_access_"}; + for (auto const& user_access_only : user_access_only_flags) { + for (auto const& unit : time_units) { + std::stringstream ss(timeline_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + if (l.find("block") == std::string::npos) { + if (unit != "_60" || user_access_only != "all_access_") { + continue; + } } - naccesses += ParseUint32(substr); + const std::string timeline_file = test_path_ + "/" + + user_access_only + l + unit + + "_access_timeline"; + std::ifstream infile(timeline_file); + std::string line; + const uint64_t expected_naccesses = 50; + const uint64_t expected_user_accesses = 30; + ASSERT_TRUE(getline(infile, line)) << timeline_file; + uint32_t naccesses = 0; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + std::string substr; + bool read_label = false; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!read_label) { + read_label = true; + continue; + } + naccesses += ParseUint32(substr); + } + } + if (user_access_only == "user_access_only_") { + ASSERT_EQ(expected_user_accesses, naccesses) << timeline_file; + } else { + ASSERT_EQ(expected_naccesses, naccesses) << timeline_file; + } + ASSERT_OK(env_->DeleteFile(timeline_file)); } - nlines++; - ASSERT_EQ(1, naccesses); - ASSERT_EQ(expected_time, time); - expected_time += 1; } - ASSERT_EQ(expected_num_lines, nlines); - ASSERT_OK(env_->DeleteFile(timeline_file)); } } { // Validate the reuse_interval and reuse_distance csv files. 
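// (Each validation below repeats the same row walk; a hypothetical helper,
// not part of the test, makes the pattern explicit:
//
//   double SumCsvRow(const std::string& line) {  // e.g. "label,25.0,75.0"
//     std::stringstream ss(line);
//     std::string cell;
//     std::getline(ss, cell, ',');  // skip the leading label column
//     double sum = 0;
//     while (std::getline(ss, cell, ',')) {
//       sum += ParseDouble(cell);
//     }
//     return sum;  // a complete percentage breakdown should sum to 100
//   }
// )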
std::map test_reuse_csv_files; - test_reuse_csv_files["_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_access_reuse_interval"] = reuse_interval_labels_; test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_; + test_reuse_csv_files["_reuse_lifetime"] = reuse_lifetime_labels_; + test_reuse_csv_files["_avg_reuse_interval"] = reuse_interval_labels_; + test_reuse_csv_files["_avg_reuse_interval_naccesses"] = + reuse_interval_labels_; for (auto const& test : test_reuse_csv_files) { const std::string& file_suffix = test.first; const std::string& labels = test.second; - const uint32_t expected_num_rows = 10; - const uint32_t expected_num_rows_absolute_values = 5; - const uint32_t expected_reused_blocks = 0; + const uint32_t expected_num_rows = 5; std::stringstream ss(labels); while (ss.good()) { std::string l; @@ -300,7 +336,6 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::ifstream infile(reuse_csv_file); std::string line; ASSERT_TRUE(getline(infile, line)); - uint32_t nblocks = 0; double npercentage = 0; uint32_t nrows = 0; while (getline(infile, line)) { @@ -314,20 +349,162 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { label_read = true; continue; } - if (nrows < expected_num_rows_absolute_values) { - nblocks += ParseUint32(substr); - } else { - npercentage += ParseDouble(substr); - } + npercentage += ParseDouble(substr); } } ASSERT_EQ(expected_num_rows, nrows); - ASSERT_EQ(expected_reused_blocks, nblocks); - ASSERT_LT(npercentage, 0); + if ("_reuse_lifetime" == test.first || + "_avg_reuse_interval" == test.first || + "_avg_reuse_interval_naccesses" == test.first) { + ASSERT_EQ(100, npercentage) << reuse_csv_file; + } else { + ASSERT_LT(npercentage, 0); + } ASSERT_OK(env_->DeleteFile(reuse_csv_file)); } } } + + { + // Validate the percentage of accesses summary. + const std::string percent_access_summary_file = + test_path_ + "/percentage_of_accesses_summary"; + std::ifstream infile(percent_access_summary_file); + std::string line; + ASSERT_TRUE(getline(infile, line)); + std::set callers; + std::set expected_callers{"Get", "MultiGet", "Iterator", + "Prefetch", "Compaction"}; + while (getline(infile, line)) { + std::stringstream caller_percent(line); + std::string caller; + ASSERT_TRUE(getline(caller_percent, caller, ',')); + std::string percent; + ASSERT_TRUE(getline(caller_percent, percent, ',')); + ASSERT_FALSE(caller_percent.good()); + callers.insert(caller); + ASSERT_EQ(20, ParseDouble(percent)); + } + ASSERT_EQ(expected_callers.size(), callers.size()); + for (auto caller : callers) { + ASSERT_TRUE(expected_callers.find(caller) != expected_callers.end()); + } + ASSERT_OK(env_->DeleteFile(percent_access_summary_file)); + } + { + // Validate the percentage of accesses summary by analyzing callers. 
+ std::stringstream analyzing_callers(analyzing_callers_); + while (analyzing_callers.good()) { + std::string caller; + ASSERT_TRUE(getline(analyzing_callers, caller, ',')); + std::vector breakdowns{"level", "bt"}; + for (auto breakdown : breakdowns) { + const std::string file_name = test_path_ + "/" + caller + "_" + + breakdown + + "_percentage_of_accesses_summary"; + std::ifstream infile(file_name); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum = 0; + while (getline(infile, line)) { + std::stringstream label_percent(line); + std::string label; + ASSERT_TRUE(getline(label_percent, label, ',')); + std::string percent; + ASSERT_TRUE(getline(label_percent, percent, ',')); + ASSERT_FALSE(label_percent.good()); + sum += ParseDouble(percent); + } + ASSERT_EQ(100, sum); + ASSERT_OK(env_->DeleteFile(file_name)); + } + } + } + const std::vector access_types{"user_access_only", "all_access"}; + const std::vector prefix{"bt", "cf"}; + for (auto const& pre : prefix) { + for (auto const& access_type : access_types) { + { + // Validate the access count summary. + const std::string bt_access_count_summary = test_path_ + "/" + pre + + "_" + access_type + + "_access_count_summary"; + std::ifstream infile(bt_access_count_summary); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum_percent = 0; + while (getline(infile, line)) { + std::stringstream bt_percent(line); + std::string bt; + ASSERT_TRUE(getline(bt_percent, bt, ',')); + std::string percent; + ASSERT_TRUE(getline(bt_percent, percent, ',')); + sum_percent += ParseDouble(percent); + } + ASSERT_EQ(100.0, sum_percent); + ASSERT_OK(env_->DeleteFile(bt_access_count_summary)); + } + } + } + for (auto const& access_type : access_types) { + std::vector block_types{"Index", "Data", "Filter"}; + for (auto block_type : block_types) { + // Validate reuse block timeline. 
+ const std::string reuse_blocks_timeline = test_path_ + "/" + block_type + + "_" + access_type + + "_5_reuse_blocks_timeline"; + std::ifstream infile(reuse_blocks_timeline); + std::string line; + ASSERT_TRUE(getline(infile, line)) << reuse_blocks_timeline; + uint32_t index = 0; + while (getline(infile, line)) { + std::stringstream timeline(line); + bool start_time = false; + double sum = 0; + while (timeline.good()) { + std::string value; + ASSERT_TRUE(getline(timeline, value, ',')); + if (!start_time) { + start_time = true; + continue; + } + sum += ParseDouble(value); + } + index++; + ASSERT_LT(sum, 100.0 * index + 1) << reuse_blocks_timeline; + } + ASSERT_OK(env_->DeleteFile(reuse_blocks_timeline)); + } + } + + std::stringstream ss(analyze_get_spatial_locality_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::vector spatial_locality_files{ + "_percent_ref_keys", "_percent_accesses_on_ref_keys", + "_percent_data_size_on_ref_keys"}; + for (auto const& spatial_locality_file : spatial_locality_files) { + const std::string filename = test_path_ + "/" + l + spatial_locality_file; + std::ifstream infile(filename); + std::string line; + ASSERT_TRUE(getline(infile, line)); + double sum_percent = 0; + uint32_t nrows = 0; + while (getline(infile, line)) { + std::stringstream bt_percent(line); + std::string bt; + ASSERT_TRUE(getline(bt_percent, bt, ',')); + std::string percent; + ASSERT_TRUE(getline(bt_percent, percent, ',')); + sum_percent += ParseDouble(percent); + nrows++; + } + ASSERT_EQ(11, nrows); + ASSERT_EQ(100.0, sum_percent); + ASSERT_OK(env_->DeleteFile(filename)); + } + } ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_)); } @@ -366,6 +543,7 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { // Read blocks. BlockCacheTraceAnalyzer analyzer(trace_file_path_, /*output_miss_ratio_curve_path=*/"", + /*compute_reuse_distance=*/true, /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. 
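// (Status::Incomplete is the expected terminal state of a fully consumed
// trace: block_cache_trace_analyzer_tool() likewise treats any status
// other than Incomplete as a failure to read the whole trace.)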
 ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze());
diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index 62db942044c..a74dc4d58cb 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -29,6 +29,8 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) {
 } // namespace

 const uint64_t kMicrosInSecond = 1000 * 1000;
+const uint64_t kSecondInMinute = 60;
+const uint64_t kSecondInHour = 3600;
 const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
 "UnknownColumnFamily";
 const uint64_t BlockCacheTraceHelper::kReservedGetId = 0;
diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h
index 66cbb5adefa..3b26a18d639 100644
--- a/trace_replay/block_cache_tracer.h
+++ b/trace_replay/block_cache_tracer.h
@@ -17,6 +17,9 @@ namespace rocksdb {

 extern const uint64_t kMicrosInSecond;
+extern const uint64_t kSecondInMinute;
+extern const uint64_t kSecondInHour;
+
 class BlockCacheTraceHelper {
  public:

From 61876614dce8c9155e28d40b5d95ec1bf1cbfa47 Mon Sep 17 00:00:00 2001
From: Sergei Petrunia
Date: Fri, 12 Jul 2019 17:26:19 -0700
Subject: [PATCH 218/572] Fix MyRocks compile warnings-treated-as-errors on
 Fedora 30, gcc 9.1.1 (#5553)

Summary:
- Provide assignment operator in CompactionStats
- Provide a copy constructor for FileDescriptor
- Remove std::move from "return std::move(t)" in BoundedQueue

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5553

Differential Revision: D16230170

fbshipit-source-id: fd7c6e52390b2db1be24141e25649cf62424d078
---
 db/internal_stats.h | 22 +++++++++++++++++++
 db/version_edit.h | 2 ++
 .../persistent_cache/persistent_cache_util.h | 2 +-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/db/internal_stats.h b/db/internal_stats.h
index 20fb07f4853..ebe90d574d6 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -237,6 +237,28 @@ class InternalStats {
 }
 }

+ CompactionStats& operator=(const CompactionStats& c) {
+ micros = c.micros;
+ cpu_micros = c.cpu_micros;
+ bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+ bytes_read_output_level = c.bytes_read_output_level;
+ bytes_written = c.bytes_written;
+ bytes_moved = c.bytes_moved;
+ num_input_files_in_non_output_levels =
+ c.num_input_files_in_non_output_levels;
+ num_input_files_in_output_level = c.num_input_files_in_output_level;
+ num_output_files = c.num_output_files;
+ num_input_records = c.num_input_records;
+ num_dropped_records = c.num_dropped_records;
+ count = c.count;
+
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ return *this;
+ }
+
 void Clear() {
 this->micros = 0;
 this->cpu_micros = 0;
diff --git a/db/version_edit.h b/db/version_edit.h
index e1857b37fc4..4a93db34e15 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -52,6 +52,8 @@ struct FileDescriptor {
 smallest_seqno(_smallest_seqno),
 largest_seqno(_largest_seqno) {}

+ FileDescriptor(const FileDescriptor& fd) { *this=fd; }
+
 FileDescriptor& operator=(const FileDescriptor& fd) {
 table_reader = fd.table_reader;
 packed_number_and_path_id = fd.packed_number_and_path_id;
diff --git a/utilities/persistent_cache/persistent_cache_util.h b/utilities/persistent_cache/persistent_cache_util.h
index 214bb5875d6..254c038f985 100644
--- a/utilities/persistent_cache/persistent_cache_util.h
+++ b/utilities/persistent_cache/persistent_cache_util.h
@@ -48,7 +48,7 @@ class BoundedQueue {
 T t = std::move(q_.front());
 size_ -= t.Size();
 q_.pop_front();
- return std::move(t);
+ return t;
 }

 size_t Size() const {
From 68d43b4d303d76836e0f2a4600de5de5e98fefea Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Fri, 12 Jul 2019 18:52:48 -0700
Subject: [PATCH 219/572] A python script to plot graphs for csv files
 generated by block_cache_trace_analyzer

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5563

Test Plan: Manually run the script on files generated by block_cache_trace_analyzer.

Differential Revision: D16214400

Pulled By: HaoyuHuang

fbshipit-source-id: 94485eed995e9b2b63e197c5dfeb80129fa7897f
---
 tools/block_cache_trace_analyzer_plot.py | 403 +++++++++++++++++++++++
 1 file changed, 403 insertions(+)
 create mode 100644 tools/block_cache_trace_analyzer_plot.py

diff --git a/tools/block_cache_trace_analyzer_plot.py b/tools/block_cache_trace_analyzer_plot.py
new file mode 100644
index 00000000000..22d56b932c5
--- /dev/null
+++ b/tools/block_cache_trace_analyzer_plot.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+import csv
+import os
+import random
+import sys
+
+import matplotlib.backends.backend_pdf
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+# Make sure a legend has the same color across all generated graphs.
+def get_cmap(n, name="hsv"):
+ """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
+ RGB color; the keyword argument name must be a standard mpl colormap name."""
+ return plt.cm.get_cmap(name, n)
+
+
+color_index = 0
+bar_color_maps = {}
+colors = []
+n_colors = 60
+linear_colors = get_cmap(n_colors)
+for i in range(n_colors):
+ colors.append(linear_colors(i))
+# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
+random.shuffle(colors)
+
+
+def num_to_gb(n):
+ one_gb = 1024 * 1024 * 1024
+ if float(n) % one_gb == 0:
+ return "{}".format(n / one_gb)
+ # Keep two decimal points.
+ return "{0:.2f}".format(float(n) / one_gb)
+
+
+def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
+ mrc_file_path = csv_result_dir + "/mrc"
+ if not os.path.exists(mrc_file_path):
+ return
+ miss_ratios = {}
+ print("Processing file {}".format(mrc_file_path))
+ with open(mrc_file_path, "r") as csvfile:
+ rows = csv.reader(csvfile, delimiter=",")
+ is_header = False
+ for row in rows:
+ if not is_header:
+ is_header = True
+ continue
+ cache_name = row[0]
+ num_shard_bits = int(row[1])
+ ghost_capacity = int(row[2])
+ capacity = int(row[3])
+ miss_ratio = float(row[4])
+ config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
+ if config not in miss_ratios:
+ miss_ratios[config] = {}
+ miss_ratios[config]["x"] = []
+ miss_ratios[config]["y"] = []
+ miss_ratios[config]["x"].append(num_to_gb(capacity))
+ miss_ratios[config]["y"].append(miss_ratio)
+ fig = plt.figure()
+ for config in miss_ratios:
+ plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config)
+ plt.xlabel("Cache capacity (GB)")
+ plt.ylabel("Miss Ratio (%)")
+ # plt.xscale('log', basex=2)
+ plt.ylim(ymin=0)
+ plt.title("RocksDB block cache miss ratios")
+ plt.legend()
+ fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight")
+
+
+def sanitize(label):
+ # matplotlib cannot plot legends that are prefixed with "_"
+ # so we need to remove them here.
+ index = 0
+ for i in range(len(label)):
+ if label[i] == "_":
+ index += 1
+ else:
+ break
+ data = label[index:]
+ # The value of uint64_max in c++.
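+ # (i.e. std::numeric_limits<uint64_t>::max()); render it as "max" so
+ # axis labels and chart legends stay readable.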
+ if "18446744073709551615" in data: + return "max" + return data + + +# Read the csv file vertically, i.e., group the data by columns. +def read_data_for_plot_vertical(csvfile): + x = [] + labels = [] + label_stats = {} + csv_rows = csv.reader(csvfile, delimiter=",") + data_rows = [] + for row in csv_rows: + data_rows.append(row) + # header + for i in range(1, len(data_rows[0])): + labels.append(sanitize(data_rows[0][i])) + label_stats[i - 1] = [] + for i in range(1, len(data_rows)): + for j in range(len(data_rows[i])): + if j == 0: + x.append(sanitize(data_rows[i][j])) + continue + label_stats[j - 1].append(float(data_rows[i][j])) + return x, labels, label_stats + + +# Read the csv file horizontally, i.e., group the data by rows. +def read_data_for_plot_horizontal(csvfile): + x = [] + labels = [] + label_stats = {} + csv_rows = csv.reader(csvfile, delimiter=",") + data_rows = [] + for row in csv_rows: + data_rows.append(row) + # header + for i in range(1, len(data_rows)): + labels.append(sanitize(data_rows[i][0])) + label_stats[i - 1] = [] + for i in range(1, len(data_rows[0])): + x.append(sanitize(data_rows[0][i])) + for i in range(1, len(data_rows)): + for j in range(len(data_rows[i])): + if j == 0: + # label + continue + label_stats[i - 1].append(float(data_rows[i][j])) + return x, labels, label_stats + + +def read_data_for_plot(csvfile, vertical): + if vertical: + return read_data_for_plot_vertical(csvfile) + return read_data_for_plot_horizontal(csvfile) + + +def plot_line_charts( + csv_result_dir, + output_result_dir, + filename_suffix, + pdf_name, + xlabel, + ylabel, + title, + vertical, + legend, +): + pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + print("Processing file {}".format(file)) + with open(csv_result_dir + "/" + file, "r") as csvfile: + x, labels, label_stats = read_data_for_plot(csvfile, vertical) + if len(x) == 0 or len(labels) == 0: + continue + # plot figure + fig = plt.figure() + for label_index in label_stats: + plt.plot( + [int(x[i]) for i in range(len(x))], + label_stats[label_index], + label=labels[label_index], + ) + + # Translate time unit into x labels. + if "_60" in file: + plt.xlabel("{} (Minute)".format(xlabel)) + if "_3600" in file: + plt.xlabel("{} (Hour)".format(xlabel)) + plt.ylabel(ylabel) + plt.title("{} {}".format(title, file)) + if legend: + plt.legend() + pdf.savefig(fig) + pdf.close() + + +def plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix, + pdf_name, + xlabel, + ylabel, + title, + vertical, + x_prefix, +): + global color_index, bar_color_maps, colors + pdf = matplotlib.backends.backend_pdf.PdfPages( + "{}/{}".format(output_result_dir, pdf_name) + ) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + with open(csv_result_dir + "/" + file, "r") as csvfile: + print("Processing file {}/{}".format(csv_result_dir, file)) + x, labels, label_stats = read_data_for_plot(csvfile, vertical) + if len(x) == 0 or len(label_stats) == 0: + continue + # Plot figure + fig = plt.figure() + ind = np.arange(len(x)) # the x locations for the groups + width = 0.5 # the width of the bars: can also be len(x) sequence + bars = [] + bottom_bars = [] + for _i in label_stats[0]: + bottom_bars.append(0) + for i in range(0, len(label_stats)): + # Assign a unique color to this label. 
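+ # bar_color_maps persists across files, so a label that appears in
+ # several charts is drawn with the same color in each of them.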
+ if labels[i] not in bar_color_maps: + bar_color_maps[labels[i]] = colors[color_index] + color_index += 1 + p = plt.bar( + ind, + label_stats[i], + width, + bottom=bottom_bars, + color=bar_color_maps[labels[i]], + ) + bars.append(p[0]) + for j in range(len(label_stats[i])): + bottom_bars[j] += label_stats[i][j] + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.xticks( + ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8 + ) + plt.legend(bars, labels) + plt.title("{} filename:{}".format(title, file)) + pdf.savefig(fig) + pdf.close() + + +def plot_access_timeline(csv_result_dir, output_result_dir): + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_timeline", + pdf_name="access_time.pdf", + xlabel="Time", + ylabel="Throughput", + title="Access timeline with group by label", + vertical=False, + legend=True, + ) + + +def plot_reuse_graphs(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="avg_reuse_interval_naccesses", + pdf_name="avg_reuse_interval_naccesses.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="Average reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="avg_reuse_interval", + pdf_name="avg_reuse_interval.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="Average reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_reuse_interval", + pdf_name="reuse_interval.pdf", + xlabel="Seconds", + ylabel="Percentage of accesses", + title="Reuse interval", + vertical=True, + x_prefix="< ", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="reuse_lifetime", + pdf_name="reuse_lifetime.pdf", + xlabel="Seconds", + ylabel="Percentage of blocks", + title="Reuse lifetime", + vertical=True, + x_prefix="< ", + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_suffix="reuse_blocks_timeline", + pdf_name="reuse_blocks_timeline.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="Reuse blocks timeline", + vertical=False, + legend=False, + ) + + +def plot_percentage_access_summary(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percentage_of_accesses_summary", + pdf_name="percentage_access.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_ref_keys", + pdf_name="percent_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_data_size_on_ref_keys", + pdf_name="percent_data_size_on_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="percent_accesses_on_ref_keys", + pdf_name="percent_accesses_on_ref_keys.pdf", + xlabel="", + ylabel="Percentage of blocks", + title="", + vertical=True, + x_prefix="", + ) + + +def plot_access_count_summary(csv_result_dir, output_result_dir): + plot_stacked_bar_charts( + csv_result_dir, + output_result_dir, + filename_suffix="access_count_summary", + pdf_name="access_count_summary.pdf", + 
xlabel="Access count",
+ ylabel="Percentage of blocks",
+ title="",
+ vertical=True,
+ x_prefix="< ",
+ )
+
+
+if __name__ == "__main__":
+ if len(sys.argv) < 3:
+ print(
+ "Must provide two arguments: 1) The directory that saves a list of "
+ "directories which contain block cache trace analyzer result files "
+ "2) the directory to save plotted graphs."
+ )
+ exit(1)
+ csv_result_dir = sys.argv[1]
+ output_result_dir = sys.argv[2]
+ print(
+ "Processing directory {} and save graphs to {}.".format(
+ csv_result_dir, output_result_dir
+ )
+ )
+ for csv_relative_dir in os.listdir(csv_result_dir):
+ csv_abs_dir = csv_result_dir + "/" + csv_relative_dir
+ result_dir = output_result_dir + "/" + csv_relative_dir
+ if not os.path.isdir(csv_abs_dir):
+ print("{} is not a directory".format(csv_abs_dir))
+ continue
+ print("Processing experiment dir: {}".format(csv_relative_dir))
+ if not os.path.exists(result_dir):
+ os.makedirs(result_dir)
+ plot_miss_ratio_graphs(csv_abs_dir, result_dir)
+ plot_access_timeline(csv_abs_dir, result_dir)
+ plot_reuse_graphs(csv_abs_dir, result_dir)
+ plot_percentage_access_summary(csv_abs_dir, result_dir)
+ plot_access_count_summary(csv_abs_dir, result_dir)
From f064d74e4549964566e1f9a5bf988bf94acbd5e1 Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Mon, 15 Jul 2019 11:16:55 -0700
Subject: [PATCH 220/572] Cleanup the Arm64 CRC32 unused warning (#5565)

Summary:
When 'HAVE_ARM64_CRC' is set, the methods below:
- bool rocksdb::crc32c::isSSE42()
- bool rocksdb::crc32c::isPCLMULQDQ()
are defined but not used, so an unused-function warning is raised when building rocksdb.

This patch cleans up these warnings by adding an ifndef: when building under HAVE_ARM64_CRC, we do not define `isSSE42` and `isPCLMULQDQ`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5565

Differential Revision: D16233654

fbshipit-source-id: c32a9dda7465dbf65f9ccafef159124db92cdffd
---
 util/crc32c.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/util/crc32c.cc b/util/crc32c.cc
index e8d4116ff42..9e838b830f5 100644
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
@@ -398,6 +398,8 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
 return static_cast<uint32_t>(l ^ 0xffffffffu);
 }

+// Detect if ARM64 CRC or not.
+#ifndef HAVE_ARM64_CRC
 // Detect if SS42 or not.
#ifndef HAVE_POWER8 @@ -436,6 +438,7 @@ static bool isPCLMULQDQ() { } #endif // HAVE_POWER8 +#endif // HAVE_ARM64_CRC typedef uint32_t (*Function)(uint32_t, const char*, size_t); From b0259e45e0be576f98e31020975a8b1cef8fb31f Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Mon, 15 Jul 2019 11:39:18 -0700 Subject: [PATCH 221/572] add more tracing for stats history (#5566) Summary: Sample info log output from db_bench: In-memory: ``` 2019/07/12-21:39:19.478490 7fa01b3f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:39:19.478633 7fa01b3f5700 [_impl/db_impl.cc:753] Storing 145 stats with timestamp 1562992759 to in-memory stats history 2019/07/12-21:39:19.478670 7fa01b3f5700 [_impl/db_impl.cc:766] [Pre-GC] In-memory stats history size: 1051218 bytes, slice count: 103 2019/07/12-21:39:19.478704 7fa01b3f5700 [_impl/db_impl.cc:775] [Post-GC] In-memory stats history size: 1051218 bytes, slice count: 102 ``` On-disk: ``` 2019/07/12-21:48:53.862548 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:53.862553 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:53.862852 7f24943f5700 [_impl/db_impl.cc:737] Writing 145 stats with timestamp 1562993333 to persistent stats CF succeeded ``` ``` 2019/07/12-21:48:51.861711 7f24943f5700 [_impl/db_impl.cc:702] ------- PERSISTING STATS ------- 2019/07/12-21:48:51.861729 7f24943f5700 [_impl/db_impl.cc:709] Reading 145 stats from statistics 2019/07/12-21:48:51.861921 7f24943f5700 [_impl/db_impl.cc:732] Writing to persistent stats CF failed -- Result incomplete: Write stall ... 2019/07/12-21:48:51.873032 7f2494bf6700 [WARN] [lumn_family.cc:749] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5566 Differential Revision: D16258187 Pulled By: miasantreble fbshipit-source-id: 292497099b941418590ed4312411bee36e244dc5 --- db/db_impl/db_impl.cc | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index cf8dddb7fe1..6f2ebdc8098 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -14,6 +14,7 @@ #endif #include +#include #include #include #include @@ -697,10 +698,15 @@ void DBImpl::PersistStats() { if (!statistics->getTickerMap(&stats_map)) { return; } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- PERSISTING STATS -------"); if (immutable_db_options_.persist_stats_to_disk) { WriteBatch batch; if (stats_slice_initialized_) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Reading %" ROCKSDB_PRIszt " stats from statistics\n", + stats_slice_.size()); for (const auto& stat : stats_map) { char key[100]; int length = @@ -722,8 +728,13 @@ void DBImpl::PersistStats() { Status s = Write(wo, &batch); if (!s.ok()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Writing to persistent stats CF failed -- %s\n", + "Writing to persistent stats CF failed -- %s", s.ToString().c_str()); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); } // TODO(Zhongyi): add purging for persisted data } else { @@ -736,6 +747,10 @@ void DBImpl::PersistStats() { stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" 
ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
 stats_history_[now_seconds] = stats_delta;
 }
 stats_slice_initialized_ = true;
@@ -743,15 +758,22 @@
 TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");

 // delete older stats snapshots to control memory consumption
- bool purge_needed =
- EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
 while (purge_needed && !stats_history_.empty()) {
 stats_history_.erase(stats_history_.begin());
 purge_needed =
 EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
 }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
 }
- // TODO: persist stats to disk
 #endif // !ROCKSDB_LITE
 }
From 6e8a1354a799f14fb068fdecd771daa64918d36d Mon Sep 17 00:00:00 2001
From: Tomas Kolda
Date: Mon, 15 Jul 2019 12:15:21 -0700
Subject: [PATCH 222/572] Fix regression - 100% CPU - Regression for Windows 7
 (#5557)

Summary:
Fixes https://github.com/facebook/rocksdb/issues/5552

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5557

Differential Revision: D16266329

fbshipit-source-id: a8f6b50298a6f7c8d6c7e172bb26dd7eb6bd8a4d
---
 port/win/env_win.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/port/win/env_win.cc b/port/win/env_win.cc
index 9abb14d67ea..7718ebd72c5 100644
--- a/port/win/env_win.cc
+++ b/port/win/env_win.cc
@@ -979,8 +979,7 @@ uint64_t WinEnvIO::NowMicros() {
 return li.QuadPart;
 }
 using namespace std::chrono;
- return duration_cast<microseconds>(
- high_resolution_clock::now().time_since_epoch()).count();
+ return duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
 }

 uint64_t WinEnvIO::NowNanos() {
From cd2520361d38ef3556d3bda479fd7a4caa0d1168 Mon Sep 17 00:00:00 2001
From: Jim Lin
Date: Mon, 15 Jul 2019 12:55:37 -0700
Subject: [PATCH 223/572] Fix memory leak in `rocksdb_wal_iter_get_batch` function (#5515)

Summary:
`wal_batch.writeBatchPtr.release()` gives up the ownership of the original `WriteBatch`, but there is no new owner, which causes a memory leak.

The patch is simple. Removing `release()` prevents the ownership change. `std::move` is for speed.
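As a minimal standalone illustration of the ownership difference (using a stand-in `WriteBatch` type, not the actual RocksDB class):

```
#include <memory>
#include <utility>

struct WriteBatch {};  // stand-in for rocksdb::WriteBatch

int main() {
  std::unique_ptr<WriteBatch> write_batch_ptr(new WriteBatch);

  // Leaky pattern: release() detaches the object from the unique_ptr, and
  // after copying from the dereferenced pointer nothing owns or deletes it:
  //   WriteBatch copy = *write_batch_ptr.release();
  //
  // Fixed pattern: move out of the still-owned object. The unique_ptr keeps
  // ownership of the (moved-from) WriteBatch and frees it on destruction.
  WriteBatch moved = std::move(*write_batch_ptr);
  (void)moved;
  return 0;  // write_batch_ptr is destroyed here; nothing leaks
}
```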
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5515 Differential Revision: D16264281 Pulled By: riversand963 fbshipit-source-id: 51c556b7a1c977325c3aa24acb636303847151fa --- db/c.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/c.cc b/db/c.cc index 17dc766dd66..4d40558f6b1 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1034,7 +1034,7 @@ void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) { rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) { rocksdb_writebatch_t* result = rocksdb_writebatch_create(); BatchResult wal_batch = iter->rep->GetBatch(); - result->rep = * wal_batch.writeBatchPtr.release(); + result->rep = std::move(*wal_batch.writeBatchPtr); if (seq != nullptr) { *seq = wal_batch.sequence; } From 3bde41b5a3f71a67cfee67d2a26244b80c777148 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 16 Jul 2019 13:11:23 -0700 Subject: [PATCH 224/572] Move the filter readers out of the block cache (#5504) Summary: Currently, when the block cache is used for the filter block, it is not really the block itself that is stored in the cache but a FilterBlockReader object. Since this object is not pure data (it has, for instance, pointers that might dangle, including in one case a back pointer to the TableReader), it's not really sharable. To avoid the issues around this, the current code erases the cache entries when the TableReader is closed (which, BTW, is not sufficient since a concurrent TableReader might have picked up the object in the meantime). Instead of doing this, the patch moves the FilterBlockReader out of the cache altogether, and decouples the filter reader object from the filter block. In particular, instead of the TableReader owning, or caching/pinning the FilterBlockReader (based on the customer's settings), with the change the TableReader unconditionally owns the FilterBlockReader, which in turn owns/caches/pins the filter block. This change also enables us to reuse the code paths historically used for data blocks for filters as well. Note: Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a separate phase. 
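In outline, the new ownership chain looks roughly like this (a simplified sketch with stand-in types; the real classes live in table/block_based/ and are considerably more involved):

```
#include <memory>
#include <utility>

struct BlockContents {};  // pure filter data; safe to cache and share

// Stand-in for CachableEntry<T>: owns (or pins in the cache) one value.
template <typename T>
class CachableEntry {
 public:
  explicit CachableEntry(std::unique_ptr<T> value) : value_(std::move(value)) {}
  const T* GetValue() const { return value_.get(); }

 private:
  std::unique_ptr<T> value_;
};

// The reader itself is no longer a cache entry; it owns/caches/pins the
// filter block instead.
class FilterBlockReader {
 public:
  explicit FilterBlockReader(CachableEntry<BlockContents>&& block)
      : filter_block_(std::move(block)) {}

 private:
  CachableEntry<BlockContents> filter_block_;
};

// The table reader unconditionally owns the filter reader.
class TableReader {
 public:
  explicit TableReader(std::unique_ptr<FilterBlockReader> filter)
      : filter_(std::move(filter)) {}

 private:
  std::unique_ptr<FilterBlockReader> filter_;
};

int main() {
  CachableEntry<BlockContents> block(
      std::unique_ptr<BlockContents>(new BlockContents));
  TableReader table(std::unique_ptr<FilterBlockReader>(
      new FilterBlockReader(std::move(block))));
  (void)table;
  return 0;
}
```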
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504 Test Plan: make asan_check Differential Revision: D16036974 Pulled By: ltamasi fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091 --- CMakeLists.txt | 1 + HISTORY.md | 3 +- TARGETS | 1 + db/db_block_cache_test.cc | 14 +- src.mk | 1 + table/block_based/block_based_filter_block.cc | 178 +++-- table/block_based/block_based_filter_block.h | 45 +- .../block_based_filter_block_test.cc | 322 ++++++--- table/block_based/block_based_table_reader.cc | 626 +++++++----------- table/block_based/block_based_table_reader.h | 69 +- table/block_based/cachable_entry.h | 1 + table/block_based/filter_block.h | 66 +- .../block_based/filter_block_reader_common.cc | 90 +++ .../block_based/filter_block_reader_common.h | 54 ++ table/block_based/full_filter_block.cc | 156 +++-- table/block_based/full_filter_block.h | 62 +- table/block_based/full_filter_block_test.cc | 204 ++++-- table/block_based/partitioned_filter_block.cc | 303 +++++---- table/block_based/partitioned_filter_block.h | 68 +- .../partitioned_filter_block_test.cc | 118 ++-- table/table_reader.h | 3 +- table/table_test.cc | 6 +- tools/sst_dump_tool.cc | 3 +- 23 files changed, 1399 insertions(+), 995 deletions(-) create mode 100644 table/block_based/filter_block_reader_common.cc create mode 100644 table/block_based/filter_block_reader_common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c47f9811ef2..65904b8cae6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -595,6 +595,7 @@ set(SOURCES table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc + table/block_based/filter_block_reader_common.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc table/block_based/index_builder.cc diff --git a/HISTORY.md b/HISTORY.md index 099c9f37e86..2e1e03f68de 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,8 +6,9 @@ ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Index and filter blocks are now handled similarly to data blocks with regards to the block cache: instead of storing reader objects in the cache, only the blocks themselves are cached. In addition, index and filter blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any). * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes and filters are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. 
diff --git a/TARGETS b/TARGETS index 6ef3da179dc..eda1051396d 100644 --- a/TARGETS +++ b/TARGETS @@ -192,6 +192,7 @@ cpp_library( "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 8eb73a23dd7..77f37da0d45 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -365,11 +365,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - // The index eviction statistics were broken by the refactoring that moved - // the index readers out of the block cache. Disabling these until we can + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can // bring the stats back. // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. ASSERT_OK(Put(1, "key", "val")); @@ -380,13 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - // The index eviction statistics were broken by the refactoring that moved - // the index readers out of the block cache. Disabling these until we can + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can // bring the stats back. 
 // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
 // index_bytes_insert);
- ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
- filter_bytes_insert);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+ // filter_bytes_insert);
 }

 namespace {
diff --git a/src.mk b/src.mk
index bc49b7ce074..fe930d5f49b 100644
--- a/src.mk
+++ b/src.mk
@@ -115,6 +115,7 @@ LIB_SOURCES = \
 table/block_based/block_prefix_index.cc \
 table/block_based/data_block_hash_index.cc \
 table/block_based/data_block_footer.cc \
+ table/block_based/filter_block_reader_common.cc \
 table/block_based/flush_block_policy.cc \
 table/block_based/full_filter_block.cc \
 table/block_based/index_builder.cc \
diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc
index e5a32e4635f..5585b8441c5 100644
--- a/table/block_based/block_based_filter_block.cc
+++ b/table/block_based/block_based_filter_block.cc
@@ -13,6 +13,7 @@
 #include "db/dbformat.h"
 #include "monitoring/perf_context_imp.h"
 #include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
 #include "util/coding.h"
 #include "util/string_util.h"
@@ -162,58 +163,120 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() {
 }

 BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
- const SliceTransform* prefix_extractor,
- const BlockBasedTableOptions& table_opt, bool _whole_key_filtering,
- BlockContents&& contents, Statistics* stats)
- : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering),
- policy_(table_opt.filter_policy.get()),
- prefix_extractor_(prefix_extractor),
- data_(nullptr),
- offset_(nullptr),
- num_(0),
- base_lg_(0),
- contents_(std::move(contents)) {
- assert(policy_);
- size_t n = contents_.data.size();
- if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array
- base_lg_ = contents_.data[n - 1];
- uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5);
- if (last_word > n - 5) return;
- data_ = contents_.data.data();
- offset_ = data_ + last_word;
- num_ = (n - 5 - last_word) / 4;
+ const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block)
+ : FilterBlockReaderCommon(t, std::move(filter_block)) {
+ assert(table());
+ assert(table()->get_rep());
+ assert(table()->get_rep()->filter_policy);
+}
+
+std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context) {
+ assert(table);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+
+ CachableEntry<BlockContents> filter_block;
+ if (prefetch || !use_cache) {
+ const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(),
+ nullptr /* get_context */, lookup_context,
+ &filter_block);
+ if (!s.ok()) {
+ return std::unique_ptr<FilterBlockReader>();
+ }
+
+ if (use_cache && !pin) {
+ filter_block.Reset();
+ }
+ }
+
+ return std::unique_ptr<FilterBlockReader>(
+ new BlockBasedFilterBlockReader(table, std::move(filter_block)));
 }

 bool BlockBasedFilterBlockReader::KeyMayMatch(
 const Slice& key, const SliceTransform* /* prefix_extractor */,
- uint64_t block_offset, const bool /*no_io*/,
- const Slice* const /*const_ikey_ptr*/,
- BlockCacheLookupContext* /*context*/) {
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
 assert(block_offset != kNotValid);
- if (!whole_key_filtering_) {
+ if (!whole_key_filtering()) {
 return true;
 }
- return MayMatch(key, block_offset);
+ return MayMatch(key, block_offset, no_io, get_context, lookup_context);
 }

 bool BlockBasedFilterBlockReader::PrefixMayMatch(
 const Slice& prefix, const SliceTransform* /* prefix_extractor */,
- uint64_t block_offset, const bool /*no_io*/,
- const Slice* const /*const_ikey_ptr*/,
- BlockCacheLookupContext* /*context*/) {
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
 assert(block_offset != kNotValid);
- return MayMatch(prefix, block_offset);
+ return MayMatch(prefix, block_offset, no_io, get_context, lookup_context);
+}
+
+bool BlockBasedFilterBlockReader::ParseFieldsFromBlock(
+ const BlockContents& contents, const char** data, const char** offset,
+ size_t* num, size_t* base_lg) {
+ assert(data);
+ assert(offset);
+ assert(num);
+ assert(base_lg);
+
+ const size_t n = contents.data.size();
+ if (n < 5) { // 1 byte for base_lg and 4 for start of offset array
+ return false;
+ }
+
+ const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5);
+ if (last_word > n - 5) {
+ return false;
+ }
+
+ *data = contents.data.data();
+ *offset = (*data) + last_word;
+ *num = (n - 5 - last_word) / 4;
+ *base_lg = contents.data[n - 1];
+
+ return true;
 }

-bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
- uint64_t block_offset) {
- uint64_t index = block_offset >> base_lg_;
- if (index < num_) {
- uint32_t start = DecodeFixed32(offset_ + index * 4);
- uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
- if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
- Slice filter = Slice(data_ + start, limit - start);
- bool const may_match = policy_->KeyMayMatch(entry, filter);
+bool BlockBasedFilterBlockReader::MayMatch(
+ const Slice& entry, uint64_t block_offset, bool no_io,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
+ CachableEntry<BlockContents> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
+ if (!s.ok()) {
+ return true;
+ }
+
+ assert(filter_block.GetValue());
+
+ const char* data = nullptr;
+ const char* offset = nullptr;
+ size_t num = 0;
+ size_t base_lg = 0;
+ if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
+ &base_lg)) {
+ return true; // Errors are treated as potential matches
+ }
+
+ const uint64_t index = block_offset >> base_lg;
+ if (index < num) {
+ const uint32_t start = DecodeFixed32(offset + index * 4);
+ const uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
+ if (start <= limit && limit <= (uint32_t)(offset - data)) {
+ const Slice filter = Slice(data + start, limit - start);
+
+ assert(table());
+ assert(table()->get_rep());
+ const FilterPolicy* const policy = table()->get_rep()->filter_policy;
+
+ const bool may_match = policy->KeyMayMatch(entry, filter);
 if (may_match) {
 PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
 return true;
@@ -230,27 +293,54 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
 }

 size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
- return num_ * 4 + 5 + (offset_ - data_);
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
 }

 std::string BlockBasedFilterBlockReader::ToString() const {
+ CachableEntry<BlockContents> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(false /* no_io */,
nullptr /* get_context */,
+ nullptr /* lookup_context */, &filter_block);
+ if (!s.ok()) {
+ return std::string("Unable to retrieve filter block");
+ }
+
+ assert(filter_block.GetValue());
+
+ const char* data = nullptr;
+ const char* offset = nullptr;
+ size_t num = 0;
+ size_t base_lg = 0;
+ if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
+ &base_lg)) {
+ return std::string("Error parsing filter block");
+ }
+
 std::string result;
 result.reserve(1024);
 std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
- AppendItem(&result, s_fb, rocksdb::ToString(num_));
+ AppendItem(&result, s_fb, rocksdb::ToString(num));
 AppendItem(&result, s_bo, s_hd);

- for (size_t index = 0; index < num_; index++) {
- uint32_t start = DecodeFixed32(offset_ + index * 4);
- uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
+ for (size_t index = 0; index < num; index++) {
+ uint32_t start = DecodeFixed32(offset + index * 4);
+ uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
 if (start != limit) {
 result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n");
- Slice filter = Slice(data_ + start, limit - start);
+ Slice filter = Slice(data + start, limit - start);
 AppendItem(&result, start, filter.ToString(true));
 }
 }
 return result;
 }
+
 } // namespace rocksdb
diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h
index cd86ff5c8a5..43dbc4f4f9f 100644
--- a/table/block_based/block_based_filter_block.h
+++ b/table/block_based/block_based_filter_block.h
@@ -22,7 +22,8 @@
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
-#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/format.h"
 #include "util/hash.h"

 namespace rocksdb {
@@ -75,42 +76,42 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {

 // A FilterBlockReader is used to parse filter from SST table.
 // KeyMayMatch and PrefixMayMatch would trigger filter checking
-class BlockBasedFilterBlockReader : public FilterBlockReader {
+class BlockBasedFilterBlockReader
+ : public FilterBlockReaderCommon<BlockContents> {
 public:
- // REQUIRES: "contents" and *policy must stay live while *this is live.
- BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor,
- const BlockBasedTableOptions& table_opt,
- bool whole_key_filtering,
- BlockContents&& contents, Statistics* statistics);
+ BlockBasedFilterBlockReader(const BlockBasedTable* t,
+ CachableEntry<BlockContents>&& filter_block);
+
+ static std::unique_ptr<FilterBlockReader> Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context);
+
 bool IsBlockBased() override { return true; }

 bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor,
 uint64_t block_offset, const bool no_io,
- const Slice* const const_ikey_ptr,
- BlockCacheLookupContext* context) override;
+ const Slice* const const_ikey_ptr, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
 bool PrefixMayMatch(const Slice& prefix,
 const SliceTransform* prefix_extractor,
 uint64_t block_offset, const bool no_io,
 const Slice* const const_ikey_ptr,
- BlockCacheLookupContext* context) override;
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
 size_t ApproximateMemoryUsage() const override;
 // convert this object to a human readable form
 std::string ToString() const override;

 private:
- const FilterPolicy* policy_;
- const SliceTransform* prefix_extractor_;
- const char* data_; // Pointer to filter data (at block-start)
- const char* offset_; // Pointer to beginning of offset array (at block-end)
- size_t num_; // Number of entries in offset array
- size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
- BlockContents contents_;
+ static bool ParseFieldsFromBlock(const BlockContents& contents,
+ const char** data, const char** offset,
+ size_t* num, size_t* base_lg);

- bool MayMatch(const Slice& entry, uint64_t block_offset);
-
- // No copying allowed
- BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&);
- void operator=(const BlockBasedFilterBlockReader&);
+ bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
 };
+
 } // namespace rocksdb
diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc
index 220888dd2fb..70bbde96ac8 100644
--- a/table/block_based/block_based_filter_block_test.cc
+++ b/table/block_based/block_based_filter_block_test.cc
@@ -10,6 +10,7 @@
 #include "table/block_based/block_based_filter_block.h"

 #include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/coding.h"
@@ -41,28 +42,58 @@ class TestHashFilter : public FilterPolicy {
 }
 };

+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {
+ // Initialize what Open normally does as much as necessary for the test
+ rep->cache_key_prefix_size = 10;
+ }
+};
+
 class FilterBlockTest : public testing::Test {
 public:
- TestHashFilter policy_;
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ EnvOptions env_options_;
 BlockBasedTableOptions table_options_;
-
- FilterBlockTest() {
- table_options_.filter_policy.reset(new TestHashFilter());
+ InternalKeyComparator icomp_;
+ std::unique_ptr<BlockBasedTable> table_;
+
+ FilterBlockTest()
+ : ioptions_(options_),
+ env_options_(options_),
+ icomp_(options_.comparator) {
+ table_options_.no_block_cache = true;
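+ // With the block cache disabled, the tests below hand the reader a
+ // CachableEntry that directly owns its BlockContents (own_value=true).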
+ table_options_.filter_policy.reset(new TestHashFilter);
+
+ constexpr bool skip_filters = false;
+ constexpr int level = 0;
+ constexpr bool immortal_table = false;
+ table_.reset(new MockBlockBasedTable(
+ new BlockBasedTable::Rep(ioptions_, env_options_, table_options_,
+ icomp_, skip_filters, level, immortal_table)));
 }
 };

 TEST_F(FilterBlockTest, EmptyBuilder) {
 BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
- BlockContents block(builder.Finish());
- ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
- BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
- std::move(block), nullptr);
+ Slice slice(builder.Finish());
+ ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice));
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
 ASSERT_TRUE(reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 }

 TEST_F(FilterBlockTest, SingleChunk) {
@@ -77,30 +108,46 @@ TEST_F(FilterBlockTest, SingleChunk) {
 builder.StartBlock(300);
 builder.Add("hello");
 ASSERT_EQ(5, builder.NumAdded());
- BlockContents block(builder.Finish());
- BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
- std::move(block), nullptr);
- ASSERT_TRUE(reader.KeyMayMatch(
- "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ Slice slice(builder.Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 }

 TEST_F(FilterBlockTest, MultiChunk) {
@@ -123,93 +170,139 @@ TEST_F(FilterBlockTest, MultiChunk) {
 builder.Add("box");
 builder.Add("hello");

- BlockContents block(builder.Finish());
- BlockBasedFilterBlockReader reader(nullptr, table_options_, true,
- std::move(block), nullptr);
+ Slice slice(builder.Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));

 // Check first filter
- ASSERT_TRUE(reader.KeyMayMatch(
- "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/2000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 // Check second filter
- ASSERT_TRUE(reader.KeyMayMatch(
- "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
/*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 // Check third filter (empty)
 ASSERT_TRUE(!reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 // Check last filter
- ASSERT_TRUE(reader.KeyMayMatch(
- "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
- ASSERT_TRUE(reader.KeyMayMatch(
- "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader.KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 }

 // Test for block based filter block
 // use new interface in FilterPolicy to create filter builder/reader
 class BlockBasedFilterBlockTest : public testing::Test {
 public:
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ EnvOptions env_options_;
 BlockBasedTableOptions table_options_;
-
- BlockBasedFilterBlockTest() {
+ InternalKeyComparator icomp_;
+ std::unique_ptr<BlockBasedTable> table_;
+
+ BlockBasedFilterBlockTest()
+ : ioptions_(options_),
+ env_options_(options_),
+ icomp_(options_.comparator) {
+ table_options_.no_block_cache = true;
 table_options_.filter_policy.reset(NewBloomFilterPolicy(10));
- }

- ~BlockBasedFilterBlockTest() override {}
+ constexpr bool skip_filters = false;
+ constexpr int level = 0;
+ constexpr bool immortal_table = false;
+ table_.reset(new MockBlockBasedTable(
+ new BlockBasedTable::Rep(ioptions_,
env_options_, table_options_,
+ icomp_, skip_filters, level, immortal_table)));
+ }
 };

 TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {
 FilterBlockBuilder* builder =
 new BlockBasedFilterBlockBuilder(nullptr, table_options_);
- BlockContents block(builder->Finish());
- ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
- FilterBlockReader* reader = new BlockBasedFilterBlockReader(
- nullptr, table_options_, true, std::move(block), nullptr);
+ Slice slice(builder->Finish());
+ ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice));
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));

 delete builder;
 delete reader;
@@ -226,30 +319,42 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) {
 builder->Add("box");
 builder->StartBlock(300);
 builder->Add("hello");
- BlockContents block(builder->Finish());
- FilterBlockReader* reader = new BlockBasedFilterBlockReader(
- nullptr, table_options_, true, std::move(block), nullptr);
+ Slice slice(builder->Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(reader->KeyMayMatch(
 "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
 ASSERT_TRUE(!reader->KeyMayMatch(
 "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
- /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr));
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
ASSERT_TRUE(!reader->KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; @@ -276,65 +381,86 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { builder->Add("box"); builder->Add("hello"); - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); // Check first filter ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check second filter ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check third filter (empty) ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check last filter ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 26c1365c4e7..a888603d72b 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -85,6 +85,8 @@ Status ReadBlockFromFile( const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, bool for_compaction = false) { + assert(result); + BlockContents contents; BlockFetcher block_fetcher( file, prefetch_buffer, footer, options, handle, &contents, ioptions, @@ -99,6 +101,32 @@ Status ReadBlockFromFile( return s; } +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, + SequenceNumber /* global_seqno */, size_t /* read_amp_bytes_per_bit */, + MemoryAllocator* memory_allocator, bool for_compaction = false) { + assert(result); + + result->reset(new BlockContents); + + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, result->get(), ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + + const Status s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + result->reset(); + } + + return s; +} + inline MemoryAllocator* GetMemoryAllocator( const BlockBasedTableOptions& table_options) { return table_options.block_cache.get() @@ -120,7 +148,6 @@ void DeleteCachedEntry(const Slice& /*key*/, 
void* value) { delete entry; } -void DeleteCachedFilterEntry(const Slice& key, void* value); void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); // Release the cached entry and decrement its ref count. @@ -283,8 +310,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -304,7 +332,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } } - *index_reader = new PartitionIndexReader(table, std::move(index_block)); + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); return Status::OK(); } @@ -445,7 +474,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -472,8 +501,9 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -493,7 +523,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { } } - *index_reader = new BinarySearchIndexReader(table, std::move(index_block)); + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); return Status::OK(); } @@ -532,7 +563,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -552,8 +583,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(index_reader != nullptr); assert(!pin || prefetch); @@ -579,8 +611,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
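The comment above ("Create will succeed regardless") marks a deliberate design choice: failure to build the auxiliary prefix index only degrades the reader to plain binary search, it never fails the open. A minimal sketch of that graceful-degradation shape follows; the types and names are invented, and only the control flow mirrors the patch.

#include <memory>

struct Status {
  static Status OK() { return Status(); }
  bool ok() const { return true; }
};

struct PrefixIndex {};

struct Reader {
  std::unique_ptr<PrefixIndex> prefix_index_;  // optional accelerator
};

Status CreateReader(std::unique_ptr<Reader>* out) {
  // The primary structure is built first and is already usable.
  out->reset(new Reader);

  // Best effort from here on: try to attach the optional prefix index,
  // but swallow any failure instead of propagating it.
  Status s = Status::OK();  // stands in for reading the prefixes blocks
  if (s.ok()) {
    (*out)->prefix_index_.reset(new PrefixIndex);
  }
  return Status::OK();  // succeed regardless, as the comment above says
}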
- auto new_index_reader = new HashIndexReader(table, std::move(index_block)); - *index_reader = new_index_reader; + index_reader->reset(new HashIndexReader(table, std::move(index_block))); // Get prefixes block BlockHandle prefixes_handle; @@ -636,7 +667,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { prefixes_meta_contents.data, &prefix_index); // TODO: log error if (s.ok()) { - new_index_reader->prefix_index_.reset(prefix_index); + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); } return Status::OK(); @@ -679,7 +712,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else if (prefix_index_) { usage += prefix_index_->ApproximateMemoryUsage(); @@ -1453,22 +1486,49 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } - { - // Find compression dictionary handle - bool found_compression_dict; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep_->compression_dict_handle); + // Find compression dictionary handle + bool found_compression_dict = false; + s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; } BlockBasedTableOptions::IndexType index_type = rep_->index_type; const bool use_cache = table_options.cache_index_and_filter_blocks; + // pin both index and filters, down to all partitions + const bool pin_all = + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + // prefetch the first level of index const bool prefetch_index = prefetch_all || (table_options.pin_top_level_index_and_filter && index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // pin the first level of index + const bool pin_index = + pin_all || (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + std::unique_ptr index_reader; + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, lookup_context, + &index_reader); + if (!s.ok()) { + return s; + } + + rep_->index_reader = std::move(index_reader); + + // The partitions of partitioned index are always stored in cache. 
They + // hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep_->index_reader->CacheDependencies(pin_all); + } + // prefetch the first level of filter const bool prefetch_filter = prefetch_all || @@ -1476,83 +1536,36 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->filter_type == Rep::FilterType::kPartitionedFilter); // Partition filters cannot be enabled without partition indexes assert(!prefetch_filter || prefetch_index); - // pin both index and filters, down to all partitions - const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; - // pin the first level of index - const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of filter const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && rep_->filter_type == Rep::FilterType::kPartitionedFilter); - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, - prefetch_index, pin_index, &index_reader, - lookup_context); - if (s.ok()) { - assert(index_reader != nullptr); - rep_->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks + if (rep_->filter_policy) { + auto filter = new_table->CreateFilterBlockReader( + prefetch_buffer, use_cache, prefetch_filter, pin_filter, + lookup_context); + if (filter) { + // Refer to the comment above about partitioned indexes always being cached if (prefetch_all) { - rep_->index_reader->CacheDependencies(pin_all); + filter->CacheDependencies(pin_all); + } - } else { - delete index_reader; - index_reader = nullptr; + + rep_->filter = std::move(filter); + } } - // pre-fetching of blocks is turned on - // Will use block cache for meta-blocks access - // Always prefetch index and filter for level 0 // TODO(ajkr): also prefetch compression dictionary block // TODO(ajkr): also pin compression dictionary block when // `pin_l0_filter_and_index_blocks_in_cache == true`.
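To keep the consolidated pin/prefetch rules in one place for the reader: both decisions depend on prefetch_all, the table options, and the file's level. The helper below restates the boolean logic of this hunk as a reader's aid; the struct and function are invented, though the option names and conditions mirror the code above.

// Reader's aid: restates the pin/prefetch decision logic from
// PrefetchIndexAndFilterBlocks. The struct and function are invented;
// only the conditions come from the patch.
struct PinPrefetchDecision {
  bool prefetch_index;
  bool prefetch_filter;
  bool pin_index;
  bool pin_filter;
};

inline PinPrefetchDecision DecidePinPrefetch(
    bool prefetch_all, bool pin_l0_filter_and_index_blocks_in_cache,
    bool pin_top_level_index_and_filter, bool two_level_index,
    bool partitioned_filter, int level) {
  PinPrefetchDecision d;
  // Pin everything (down to all partitions) only for level-0 files when
  // pin_l0_filter_and_index_blocks_in_cache is set.
  const bool pin_all =
      pin_l0_filter_and_index_blocks_in_cache && level == 0;
  // The first level of a two-level index is prefetched/pinned when the
  // top-level option is set; prefetch_all forces prefetching.
  d.prefetch_index = prefetch_all ||
                     (pin_top_level_index_and_filter && two_level_index);
  d.pin_index = pin_all ||
                (pin_top_level_index_and_filter && two_level_index);
  // Same shape for partitioned filters; a partitioned filter requires a
  // partitioned (two-level) index.
  d.prefetch_filter = prefetch_all || (pin_top_level_index_and_filter &&
                                       partitioned_filter);
  d.pin_filter = pin_all ||
                 (pin_top_level_index_and_filter && partitioned_filter);
  return d;
}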
- if (table_options.cache_index_and_filter_blocks) { - assert(table_options.block_cache != nullptr); - if (s.ok() && prefetch_filter) { - // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = - new_table->GetFilter(rep_->table_prefix_extractor.get(), - /*prefetch_buffer=*/nullptr, /*no_io=*/false, - /*get_context=*/nullptr, lookup_context); - if (filter_entry.GetValue() != nullptr && prefetch_all) { - filter_entry.GetValue()->CacheDependencies( - pin_all, rep_->table_prefix_extractor.get()); - } - // if pin_filter is true then save it in rep_->filter_entry; it will be - // released in the destructor only, hence it will be pinned in the - // cache while this reader is alive - if (pin_filter) { - rep_->filter_entry = std::move(filter_entry); - } - } - } else { + if (!table_options.cache_index_and_filter_blocks) { std::unique_ptr compression_dict_block; - if (s.ok()) { - // Set filter block - if (rep_->filter_policy) { - const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter( - prefetch_buffer, rep_->filter_handle, !is_a_filter_partition, - rep_->table_prefix_extractor.get()); - rep_->filter.reset(filter); - // Refer to the comment above about paritioned indexes always being - // cached - if (filter && prefetch_all) { - filter->CacheDependencies(pin_all, - rep_->table_prefix_extractor.get()); - } - } - s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + if (!s.ok()) { + return s; } - if (s.ok() && !rep_->compression_dict_handle.IsNull()) { + + if (!rep_->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy rep_->uncompression_dict.reset(new UncompressionDict( @@ -1560,6 +1573,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); } } + + assert(s.ok()); return s; } @@ -1631,10 +1646,43 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, return Status::OK(); } +template +class BlocklikeTraits; + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = @@ -1654,7 +1702,7 @@ Status BlockBasedTable::GetDataBlockFromCache( block_type, get_context); if (cache_handle != nullptr) { block->SetCachedValue( - 
reinterpret_cast(block_cache->Value(cache_handle)), + reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); return s; } @@ -1698,16 +1746,17 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - std::unique_ptr block_holder( - new Block(std::move(contents), rep_->get_global_seqno(block_type), - read_amp_bytes_per_bit, statistics)); // uncompressed block + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + &DeleteCachedEntry, &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1730,10 +1779,11 @@ Status BlockBasedTable::GetDataBlockFromCache( return s; } +template Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, BlockContents* raw_block_contents, + CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, MemoryAllocator* memory_allocator, BlockType block_type, @@ -1757,7 +1807,7 @@ Status BlockBasedTable::PutDataBlockToCache( Status s; Statistics* statistics = ioptions.statistics; - std::unique_ptr block_holder; + std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { // Retrieve the uncompressed contents into a new buffer BlockContents uncompressed_block_contents; @@ -1771,11 +1821,13 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } - block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no, - read_amp_bytes_per_bit, statistics)); + block_holder.reset(BlocklikeTraits::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics)); } else { - block_holder.reset(new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, statistics)); + block_holder.reset(BlocklikeTraits::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics)); } // Insert compressed block into compressed block cache. 
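The BlocklikeTraits specializations introduced above exist so that a single set of cache helper templates can produce either a parsed block or raw block contents. The following is a self-contained illustration of the same traits idiom; the types here are invented stand-ins, not the RocksDB classes.

#include <cstdint>
#include <memory>
#include <string>
#include <utility>

// Invented stand-ins for the two "blocklike" shapes: raw bytes vs. a
// parsed block that knows its restart count.
struct RawBytes {
  std::string data;
};

struct ParsedBlock {
  explicit ParsedBlock(std::string d) : data(std::move(d)) {}
  std::string data;
  uint32_t num_restarts = 7;
};

// Primary template is intentionally left undefined; only the
// specializations below may be instantiated.
template <typename TBlocklike>
class Traits;

template <>
class Traits<RawBytes> {
 public:
  static RawBytes* Create(std::string&& d) {
    return new RawBytes{std::move(d)};
  }
  // Raw bytes are never parsed, so there are no restarts to report.
  static uint32_t GetNumRestarts(const RawBytes&) { return 0; }
};

template <>
class Traits<ParsedBlock> {
 public:
  static ParsedBlock* Create(std::string&& d) {
    return new ParsedBlock(std::move(d));
  }
  static uint32_t GetNumRestarts(const ParsedBlock& b) {
    return b.num_restarts;
  }
};

// One routine now serves both types, the same way the templated cache
// helpers above dispatch through BlocklikeTraits.
template <typename TBlocklike>
std::unique_ptr<TBlocklike> MakeBlocklike(std::string bytes) {
  return std::unique_ptr<TBlocklike>(
      Traits<TBlocklike>::Create(std::move(bytes)));
}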
@@ -1809,7 +1861,8 @@ Status BlockBasedTable::PutDataBlockToCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, priority); + &DeleteCachedEntry, &cache_handle, + priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1829,171 +1882,36 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor) const { +std::unique_ptr BlockBasedTable::CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { auto& rep = rep_; - // TODO: We might want to unify with ReadBlockFromFile() if we start - // requiring checksum verification in Table::Open. - if (rep->filter_type == Rep::FilterType::kNoFilter) { - return nullptr; - } - BlockContents block; - - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, BlockType::kFilter, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options)); - Status s = block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - // Error reading the block - return nullptr; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr(); } assert(rep->filter_policy); - auto filter_type = rep->filter_type; - if (rep->filter_type == Rep::FilterType::kPartitionedFilter && - is_a_filter_partition) { - filter_type = Rep::FilterType::kFullFilter; - } - switch (filter_type) { - case Rep::FilterType::kPartitionedFilter: { - return new PartitionedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), nullptr, - rep->ioptions.statistics, rep->internal_comparator, this, - rep_->index_key_includes_seq, rep_->index_value_is_full); - } + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kBlockFilter: - return new BlockBasedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->table_options, rep->whole_key_filtering, std::move(block), - rep->ioptions.statistics); - - case Rep::FilterType::kFullFilter: { - auto filter_bits_reader = - rep->filter_policy->GetFilterBitsReader(block.data); - assert(filter_bits_reader != nullptr); - return new FullFilterBlockReader( - rep->prefix_filtering ? 
prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), filter_bits_reader, - rep->ioptions.statistics); - } + return BlockBasedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); default: // filter_type is either kNoFilter (exited the function at the first if), // or it must be covered in this switch block assert(false); - return nullptr; - } -} - -CachableEntry BlockBasedTable::GetFilter( - const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const { - const BlockHandle& filter_blk_handle = rep_->filter_handle; - const bool is_a_filter_partition = true; - return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, lookup_context, prefix_extractor); -} - -CachableEntry BlockBasedTable::GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - const SliceTransform* prefix_extractor) const { - // If cache_index_and_filter_blocks is false, filter should be pre-populated. - // We will return rep_->filter anyway. rep_->filter can be nullptr if filter - // read fails at Open() time. We don't want to reload again since it will - // most probably fail again. - if (!is_a_filter_partition && - !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr, - /*own_value=*/false}; - } - - Cache* block_cache = rep_->table_options.block_cache.get(); - if (rep_->filter_policy == nullptr /* do not use filter */ || - block_cache == nullptr /* no block cache at all */) { - return CachableEntry(); - } - - if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { - return {rep_->filter_entry.GetValue(), /*cache=*/nullptr, - /*cache_handle=*/nullptr, /*own_value=*/false}; - } - - PERF_TIMER_GUARD(read_filter_block_nanos); - - // Fetching from the cache - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - filter_blk_handle, cache_key); - - Cache::Handle* cache_handle = - GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); - - FilterBlockReader* filter = nullptr; - size_t usage = 0; - bool is_cache_hit = false; - bool return_empty_reader = false; - if (cache_handle != nullptr) { - filter = - reinterpret_cast(block_cache->Value(cache_handle)); - usage = filter->ApproximateMemoryUsage(); - is_cache_hit = true; - } else if (no_io) { - // Do not invoke any io. - return_empty_reader = true; - } else { - filter = ReadFilter(prefetch_buffer, filter_blk_handle, - is_a_filter_partition, prefix_extractor); - if (filter != nullptr) { - usage = filter->ApproximateMemoryUsage(); - Status s = block_cache->Insert( - key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - if (s.ok()) { - UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); - } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete filter; - return_empty_reader = true; - } - } - } - - if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && - lookup_context) { - // Avoid making copy of block_key and cf_name when constructing the access - // record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", TraceType::kBlockTraceFilterBlock, - /*block_size=*/usage, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id); - block_cache_tracer_->WriteBlockAccess(access_record, key, - rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); + return std::unique_ptr(); } - - if (return_empty_reader) { - return CachableEntry(); - } - return {filter, cache_handle ? block_cache : nullptr, cache_handle, - /*own_value=*/false}; } CachableEntry BlockBasedTable::GetUncompressionDict( @@ -2178,6 +2096,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } block.TransferTo(iter); + return iter; } @@ -2294,10 +2213,11 @@ Status BlockBasedTable::GetDataBlockFromCache( // If contents is non-null, it skips the cache lookup and disk read, since // the caller has already read it. In both cases, if ro.fill_cache is true, // it inserts the block into the block cache. +template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); @@ -2347,17 +2267,18 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { no_insert = false; Statistics* statistics = rep_->ioptions.statistics; - bool do_decompress = - block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; + const bool maybe_compressed = + block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, - &raw_block_contents, rep_->ioptions, - do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, - block_type, uncompression_dict, rep_->persistent_cache_options, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); @@ -2387,21 +2308,25 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( uint64_t nkeys = 0; if (block_entry->GetValue()) { // Approximate the number of keys in the block using restarts. 
- nkeys = rep_->table_options.block_restart_interval * - block_entry->GetValue()->NumRestarts(); + nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits::GetNumRestarts(*block_entry->GetValue()); usage = block_entry->GetValue()->ApproximateMemoryUsage(); } TraceType trace_block_type = TraceType::kTraceMax; switch (block_type) { - case BlockType::kIndex: - trace_block_type = TraceType::kBlockTraceIndexBlock; - break; case BlockType::kData: trace_block_type = TraceType::kBlockTraceDataBlock; break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; case BlockType::kRangeDeletion: trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; default: // This cannot happen. assert(false); @@ -2603,10 +2528,11 @@ void BlockBasedTable::MaybeLoadBlocksToCache( } } +template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction) const { assert(block_entry); @@ -2639,16 +2565,19 @@ Status BlockBasedTable::RetrieveBlock( return Status::Incomplete("no blocking io"); } - std::unique_ptr block; + const bool maybe_compressed = + block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr block; { StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, - rep_->ioptions, rep_->blocks_maybe_compressed, - rep_->blocks_maybe_compressed, block_type, uncompression_dict, - rep_->persistent_cache_options, rep_->get_global_seqno(block_type), + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0, @@ -2665,6 +2594,22 @@ Status BlockBasedTable::RetrieveBlock( return s; } +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. 
+template Status BlockBasedTable::RetrieveBlock<BlockContents>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<BlockContents>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const; + +template Status BlockBasedTable::RetrieveBlock<Block>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const; + BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, std::unordered_map<uint64_t, CachableEntry<Block>>* block_map) @@ -2733,10 +2678,7 @@ bool BlockBasedTable::PrefixMayMatch( Status s; // First, try check with full filter - auto filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false, - /*get_context=*/nullptr, lookup_context); - FilterBlockReader* filter = filter_entry.GetValue(); + FilterBlockReader* const filter = rep_->filter.get(); bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { @@ -2798,7 +2740,7 @@ bool BlockBasedTable::PrefixMayMatch( BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( prefix, prefix_extractor, handle.offset(), /*no_io=*/false, - /*const_key_ptr=*/nullptr, lookup_context); + /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context); } } } @@ -3273,7 +3215,7 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( bool BlockBasedTable::FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor, + const SliceTransform* prefix_extractor, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return true; } Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; - if (filter->whole_key_filtering()) { + if (rep_->whole_key_filtering) { size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, - no_io, const_ikey_ptr, lookup_context); + no_io, const_ikey_ptr, get_context, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, false, - const_ikey_ptr, lookup_context)) { + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { may_match = false; } if (may_match) { @@ -3312,7 +3255,7 @@ void BlockBasedTable::FullFilterKeysMayMatch( if (filter == nullptr || filter->IsBlockBased()) { return; } - if (filter->whole_key_filtering()) { + if (rep_->whole_key_filtering) { filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && @@ -3338,25 +3281,19 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice&
key, assert(get_context != nullptr); Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry<FilterBlockReader> filter_entry; - bool may_match; - FilterBlockReader* filter = nullptr; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If the full filter is not useful, then go into each block uint64_t tracing_get_id = get_context->get_tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; - { - if (!skip_filters) { - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, - read_options.read_tier == kBlockCacheTier, - get_context, &lookup_context); - } - filter = filter_entry.GetValue(); + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); - // First check the full filter - // If full filter not useful, Then go into each block - may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor, &lookup_context); - } if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); @@ -3388,7 +3325,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), prefix_extractor, v.handle.offset(), no_io, - /*const_ikey_ptr=*/nullptr, &lookup_context); + /*const_ikey_ptr=*/nullptr, get_context, + &lookup_context); if (not_exist_in_filter) { // Not found @@ -3510,31 +3448,23 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry<FilterBlockReader> filter_entry; - FilterBlockReader* filter = nullptr; + FilterBlockReader* const filter = + !skip_filters ?
rep_->filter.get() : nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), mget_range->end()); + + // First check the full filter + // If the full filter is not useful, then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, tracing_mget_id}; - if (!skip_filters) { - { - // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, - read_options.read_tier == kBlockCacheTier, - /*get_context=*/nullptr, &lookup_context); - } - filter = filter_entry.GetValue(); + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); - // First check the full filter - // If full filter not useful, Then go into each block - FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor, &lookup_context); - } if (skip_filters || !sst_file_range.empty()) { IndexBlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable @@ -4006,7 +3936,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator( options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, - /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); iiter->Seek(key); assert(iiter->Valid()); @@ -4022,8 +3952,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, Status BlockBasedTable::CreateIndexReader( FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, - bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator.
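One mechanical change threaded through this patch is worth calling out for reviewers: factories such as CreateIndexReader now return their product through a std::unique_ptr out-parameter instead of a raw double pointer, which removes the manual delete on error paths. A generic before/after sketch follows, with invented types used purely for illustration.

#include <memory>

struct Status {
  bool ok() const { return true; }
};

class IndexReader {};

// Old shape: raw out-parameter. Every error path after the allocation
// must remember to delete and null out the pointer.
Status CreateRaw(IndexReader** index_reader) {
  *index_reader = new IndexReader;
  return Status();
}

// New shape: ownership is explicit. An early return on error simply
// destroys the unique_ptr's contents; nothing leaks.
Status CreateOwned(std::unique_ptr<IndexReader>* index_reader) {
  index_reader->reset(new IndexReader);
  return Status();
}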
@@ -4033,14 +3963,14 @@ Status BlockBasedTable::CreateIndexReader( switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, index_reader, - lookup_context); + prefetch, pin, lookup_context, + index_reader); } case BlockBasedTableOptions::kBinarySearch: case BlockBasedTableOptions::kBinarySearchWithFirstKey: { return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, index_reader, - lookup_context); + prefetch, pin, lookup_context, + index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -4056,14 +3986,14 @@ Status BlockBasedTable::CreateIndexReader( " Fall back to binary search index."); return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, - index_reader, lookup_context); + lookup_context, index_reader); } meta_index_iter = meta_iter_guard.get(); } return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, - use_cache, prefetch, pin, index_reader, - lookup_context); + use_cache, prefetch, pin, lookup_context, + index_reader); } default: { std::string error_message = @@ -4079,7 +4009,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_contex=*/&context)); + /*lookup_context=*/&context)); index_iter->Seek(key); uint64_t result; @@ -4102,8 +4032,9 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, return result; } -bool BlockBasedTable::TEST_filter_block_preloaded() const { - return rep_->filter != nullptr; +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); } bool BlockBasedTable::TEST_IndexBlockInCache() const { @@ -4167,8 +4098,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( return Status::OK(); } -Status BlockBasedTable::DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor) { +Status BlockBasedTable::DumpTable(WritableFile* out_file) { // Output Footer out_file->Append( "Footer Details:\n" @@ -4225,36 +4155,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, " "); out_file->Append(table_properties->ToString("\n ", ": ").c_str()); out_file->Append("\n"); - - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, - false /*decompress*/, false /*maybe_compressed*/, - BlockType::kFilter, UncompressionDict::GetEmptyDict(), - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); - } - } - } - } 
} + if (rep_->filter) { out_file->Append( "Filter Details:\n" @@ -4318,22 +4220,17 @@ void BlockBasedTable::Close() { return; } - Cache* const cache = rep_->table_options.block_cache.get(); - // cleanup index, filter, and compression dictionary blocks // to avoid accessing dangling pointers if (!rep_->table_options.no_block_cache) { - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - - // Get the filter block key - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->filter_handle, cache_key); - cache->Erase(key); - if (!rep_->compression_dict_handle.IsNull()) { // Get the compression dictionary block key - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key); + + Cache* const cache = rep_->table_options.block_cache.get(); cache->Erase(key); } } @@ -4518,15 +4415,6 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, namespace { -void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { - FilterBlockReader* filter = reinterpret_cast(value); - if (filter->statistics() != nullptr) { - RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, - filter->ApproximateMemoryUsage()); - } - delete filter; -} - void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { UncompressionDict* dict = reinterpret_cast(value); RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 750700813d3..189cd5d2e3a 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -172,8 +172,7 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor = nullptr) override; + Status DumpTable(WritableFile* out_file) override; Status VerifyChecksum(TableReaderCaller caller) override; @@ -181,7 +180,7 @@ class BlockBasedTable : public TableReader { ~BlockBasedTable(); - bool TEST_filter_block_preloaded() const; + bool TEST_FilterBlockInCache() const; bool TEST_IndexBlockInCache() const; // IndexReader is the interface that provides the functionality for index @@ -241,6 +240,8 @@ class BlockBasedTable : public TableReader { class PartitionedIndexIteratorState; + template + friend class FilterBlockReaderCommon; friend class PartitionIndexReader; protected: @@ -278,21 +279,23 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. + template Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). 
+ template Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction = false) const; @@ -310,19 +313,6 @@ class BlockBasedTable : public TableReader { CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, char* scratch, const UncompressionDict& uncompression_dict) const; - // For the following two functions: - // if `no_io == true`, we will not try to read filter/index from sst file - // were they not present in cache yet. - CachableEntry GetFilter( - const SliceTransform* prefix_extractor, - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; - virtual CachableEntry GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - const SliceTransform* prefix_extractor) const; - CachableEntry GetUncompressionDict( FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -348,12 +338,13 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. + template Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context = nullptr) const; + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -365,11 +356,12 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
+ template Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, + CachableEntry* cached_block, + BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const; @@ -387,13 +379,14 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, - IndexReader** index_reader, - BlockCacheLookupContext* lookup_context); + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); bool FullFilterKeyMayMatch(const ReadOptions& read_options, FilterBlockReader* filter, const Slice& user_key, const bool no_io, const SliceTransform* prefix_extractor, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const; void FullFilterKeysMayMatch(const ReadOptions& read_options, @@ -435,10 +428,9 @@ class BlockBasedTable : public TableReader { Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. - virtual FilterBlockReader* ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor = nullptr) const; + std::unique_ptr CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); static void SetupCacheKeyPrefix(Rep* rep); @@ -516,17 +508,7 @@ struct BlockBasedTable::Rep { // Footer contains the fixed table information Footer footer; - // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr) - // and used only when options.block_cache is nullptr or when - // `cache_index_and_filter_blocks == false`. Otherwise, we will get the - // filter and compression dictionary blocks via the block cache. In that case, - // `filter_handle`, and `compression_dict_handle` are used to lookup these - // meta-blocks in block cache. - // - // Note: the IndexReader object is always stored in this member variable; - // the index block itself, however, may or may not be in the block cache - // based on the settings above. We plan to change the handling of the - // filter and compression dictionary similarly. 
+ std::unique_ptr index_reader; std::unique_ptr filter; std::unique_ptr uncompression_dict; @@ -553,13 +535,6 @@ struct BlockBasedTable::Rep { std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; - // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is - // true or in all levels when pin_top_level_index_and_filter is set in - // combination with partitioned filters: then we do use the LRU cache, - // but we always keep the filter block's handle checked out here (=we - // don't call Release()), plus the parsed out objects the LRU cache will never - // push flush them out, hence they're pinned - CachableEntry filter_entry; std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 5b5d16ef318..b4cd6ec6757 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -10,6 +10,7 @@ #pragma once #include +#include "port/likely.h" #include "rocksdb/cache.h" #include "rocksdb/cleanable.h" diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index d54de5ae1ab..936281bde65 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -38,6 +38,7 @@ namespace rocksdb { const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; +class GetContext; using MultiGetRange = MultiGetContext::Range; // A FilterBlockBuilder is used to construct all of the filters for a @@ -78,16 +79,14 @@ class FilterBlockBuilder { // BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - explicit FilterBlockReader() - : whole_key_filtering_(true), size_(0), statistics_(nullptr) {} - explicit FilterBlockReader(size_t s, Statistics* stats, - bool _whole_key_filtering) - : whole_key_filtering_(_whole_key_filtering), - size_(s), - statistics_(stats) {} - virtual ~FilterBlockReader() {} + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; virtual bool IsBlockBased() = 0; // If is blockbased filter + /** * If no_io is set, then it returns true if it cannot answer the query without * reading data from disk. 
This is used in PartitionedFilterBlockReader to @@ -102,17 +101,19 @@ class FilterBlockReader { const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) = 0; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) { + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - context)) { + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -125,27 +126,26 @@ class FilterBlockReader { const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) = 0; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) { + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; if (!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, context)) { + block_offset, no_io, &ikey, get_context, + lookup_context)) { range->SkipKey(iter); } } } virtual size_t ApproximateMemoryUsage() const = 0; - virtual size_t size() const { return size_; } - virtual Statistics* statistics() const { return statistics_; } - - bool whole_key_filtering() const { return whole_key_filtering_; } // convert this object to a human readable form virtual std::string ToString() const { @@ -153,30 +153,22 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool /*pin*/, - const SliceTransform* /*prefix_extractor*/) {} + virtual void CacheDependencies(bool /*pin*/) {} - virtual bool RangeMayExist( - const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool /*need_upper_bound_check*/, BlockCacheLookupContext* context) { + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, + bool /*need_upper_bound_check*/, + BlockCacheLookupContext* lookup_context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr, context); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } - - protected: - bool whole_key_filtering_; - - private: - // No copying allowed - FilterBlockReader(const FilterBlockReader&); - void operator=(const FilterBlockReader&); - size_t size_; - Statistics* statistics_; - int level_ = -1; }; } // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc new file mode 100644 index 00000000000..717a4ad0dff --- 
/dev/null +++ b/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" + +namespace rocksdb { + +template +Status FilterBlockReaderCommon::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context); + + return s; +} + +template +const SliceTransform* +FilterBlockReaderCommon::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr; +} + +template +bool FilterBlockReaderCommon::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template +Status FilterBlockReaderCommon::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + get_context, lookup_context, filter_block); +} + +template +size_t FilterBlockReaderCommon::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; + +} // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h new file mode 100644 index 00000000000..3698d3f1e91 --- /dev/null +++ b/table/block_based/filter_block_reader_common.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace rocksdb { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. 
Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry filter_block_; +}; + +} // namespace rocksdb diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 6d2b9d70a50..553bd37d974 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -16,6 +16,7 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { @@ -98,59 +99,91 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, } FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - const Slice& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FilterBlockReader(contents.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - contents_(contents) { - assert(filter_bits_reader != nullptr); - filter_bits_reader_.reset(filter_bits_reader); - if (prefix_extractor_ != nullptr) { + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { full_length_enabled_ = - prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); } } -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, - contents.data, filter_bits_reader, stats) { - block_contents_ = std::move(contents); -} - bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key); + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr FullFilterBlockReader::Create( + const 
BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new FullFilterBlockReader(table, std::move(filter_block))); } bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - return MayMatch(prefix); + return MayMatch(prefix, no_io, get_context, lookup_context); } -bool FullFilterBlockReader::MayMatch(const Slice& entry) { - if (contents_.size() != 0) { - if (filter_bits_reader_->MayMatch(entry)) { +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + if (filter_block.GetValue()->data.size() != 0) { + assert(table()); + assert(table()->get_rep()); + + std::unique_ptr filter_bits_reader( + table()->get_rep()->filter_policy->GetFilterBitsReader( + filter_block.GetValue()->data)); + assert(filter_bits_reader != nullptr); + + if (filter_bits_reader->MayMatch(entry)) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; } else { @@ -163,38 +196,58 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { // Simply return. 
Don't skip any key - consider all keys as likely to be // present return; } - MayMatch(range); + MayMatch(range, no_io, lookup_context); } void FullFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - MayMatch(range); + MayMatch(range, no_io, lookup_context); } -void FullFilterBlockReader::MayMatch(MultiGetRange* range) { - if (contents_.size() == 0) { +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { return; } + assert(filter_block.GetValue()); + + if (filter_block.GetValue()->data.size() == 0) { + return; + } + + assert(table()); + assert(table()->get_rep()); + + std::unique_ptr filter_bits_reader( + table()->get_rep()->filter_policy->GetFilterBitsReader( + filter_block.GetValue()->data)); + assert(filter_bits_reader != nullptr); + // We need to use an array instead of autovector for may_match since // &may_match[0] doesn't work for autovector (compiler error). So // declare both keys and may_match as arrays, which is also slightly less @@ -205,7 +258,7 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { for (auto iter = range->begin(); iter != range->end(); ++iter) { keys[num_keys++] = &iter->ukey; } - filter_bits_reader_->MayMatch(num_keys, &keys[0], &may_match[0]); + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); int i = 0; for (auto iter = range->begin(); iter != range->end(); ++iter) { @@ -217,13 +270,11 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { } size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = block_contents_.usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); - usage += malloc_usable_size(filter_bits_reader_.get()); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); - usage += sizeof(*filter_bits_reader_.get()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return usage; } @@ -232,7 +283,7 @@ bool FullFilterBlockReader::RangeMayExist( const Slice* iterate_upper_bound, const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, BlockCacheLookupContext* context) { + bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -245,22 +296,23 @@ bool FullFilterBlockReader::RangeMayExist( } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr, context); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } } bool FullFilterBlockReader::IsFilterCompatible( const Slice* iterate_upper_bound, const Slice& prefix, - const Comparator* comparator) { + const Comparator* comparator) const { // Try to reuse the bloom filter in the SST table if prefix_extractor in // mutable_cf_options has changed. 
If range [user_key, upper_bound) all // share the same prefix then we may still be able to use the bloom filter. - if (iterate_upper_bound != nullptr && prefix_extractor_) { - if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { return false; } - Slice upper_bound_xform = - prefix_extractor_->Transform(*iterate_upper_bound); + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix if (!comparator->Equal(prefix, upper_bound_xform)) { // second check if user_key's prefix is the immediate predecessor of diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 99e5299b34f..08a41706e6b 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -15,7 +15,8 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/block_based/filter_block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" #include "util/hash.h" namespace rocksdb { @@ -78,71 +79,58 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // A FilterBlockReader is used to parse filter from SST table. // KeyMayMatch and PrefixMayMatch would trigger filter checking -class FullFilterBlockReader : public FilterBlockReader { +class FullFilterBlockReader : public FilterBlockReaderCommon { public: - // REQUIRES: "contents" and filter_bits_reader must stay live - // while *this is live. - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - const Slice& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - - // bits_reader is created in filter_policy, it should be passed in here - // directly. 
and be deleted here - ~FullFilterBlockReader() override {} + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; size_t ApproximateMemoryUsage() const override; bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; private: - const SliceTransform* prefix_extractor_; - Slice contents_; - std::unique_ptr filter_bits_reader_; - BlockContents block_contents_; bool full_length_enabled_; size_t prefix_extractor_full_length_; - - // No copying allowed - FullFilterBlockReader(const FullFilterBlockReader&); - bool MayMatch(const Slice& entry); - void MayMatch(MultiGetRange* range); - void operator=(const FullFilterBlockReader&); - bool IsFilterCompatible(const Slice* iterate_upper_bound, - const Slice& prefix, const Comparator* comparator); - }; } // namespace rocksdb diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 57ff158c5c7..e8fcce07d75 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -6,6 +6,7 @@ #include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "table/full_filter_bits_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -40,6 +41,15 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { std::vector hash_entries_; }; +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { + // Initialize what Open normally does as much as necessary 
for the test + rep->cache_key_prefix_size = 10; + } +}; + class TestFilterBitsReader : public FilterBitsReader { public: explicit TestFilterBitsReader(const Slice& contents) @@ -95,26 +105,46 @@ class TestHashFilter : public FilterPolicy { class PluginFullFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - PluginFullFilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); + PluginFullFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; + table_options_.filter_policy.reset(new TestHashFilter); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); } }; TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + FullFilterBlockReader reader(table_.get(), std::move(block)); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -125,57 +155,90 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { builder.Add("box"); builder.Add("box"); builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice = builder.Finish(); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + 
ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } class FullFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - FullFilterBlockTest() { + FullFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - } - ~FullFilterBlockTest() override {} + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); + } }; TEST_F(FullFilterBlockTest, EmptyBuilder) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(FullFilterBlockTest, DuplicateEntries) { @@ -221,31 +284,46 @@ TEST_F(FullFilterBlockTest, SingleChunk) { builder.Add("box"); 
builder.Add("hello"); ASSERT_EQ(5, builder.NumAdded()); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice = builder.Finish(); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index dcd985152bb..ae57e85dca6 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -119,113 +119,77 @@ Slice PartitionedFilterBlockBuilder::Finish( } PartitionedFilterBlockReader::PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - 
const bool index_value_is_full) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - comparator_(comparator), - table_(table), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - idx_on_fltr_blk_.reset(new Block(std::move(contents), - kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, stats)); -} + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} -PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { - // TODO(myabandeh): if instead of filter object we store only the blocks in - // block cache, then we don't have to manually earse them from block cache - // here. - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (UNLIKELY(block_cache == nullptr)) { - return; - } - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value().handle; - auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, - table_->rep_->cache_key_prefix_size, - handle, cache_key); - block_cache->Erase(key); +std::unique_ptr PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } } + + return std::unique_ptr( + new PartitionedFilterBlockReader(table, std::move(filter_block))); } bool PartitionedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) { + GetContext* get_context, BlockCacheLookupContext* lookup_context) { assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!whole_key_filtering_) { - return true; - } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range - return false; - } - auto filter_partition = - GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, - prefix_extractor, context); - if (UNLIKELY(!filter_partition.GetValue())) { + if (!whole_key_filtering()) { return true; } - return filter_partition.GetValue()->KeyMayMatch( - key, prefix_extractor, block_offset, no_io, /*const_ikey_ptr=*/nullptr, - context); + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); } bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - 
BlockCacheLookupContext* context) { + GetContext* get_context, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!prefix_extractor_ && !prefix_extractor) { + if (!table_prefix_extractor() && !prefix_extractor) { return true; } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // prefix is out of range - return false; - } - auto filter_partition = - GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, - prefix_extractor, context); - if (UNLIKELY(!filter_partition.GetValue())) { - return true; - } - return filter_partition.GetValue()->PrefixMayMatch( - prefix, prefix_extractor, kNotValid, no_io, /*const_ikey_ptr=*/nullptr, - context); + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); } BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( - const Slice& entry) { + const CachableEntry& filter_block, const Slice& entry) const { IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return BlockHandle(0, 0); @@ -235,39 +199,78 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } -CachableEntry -PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* context) { - const bool is_a_filter_partition = true; - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (LIKELY(block_cache != nullptr)) { - if (filter_map_.size() != 0) { - auto iter = filter_map_.find(fltr_blk_handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (iter != filter_map_.end()) { - return {iter->second.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; - } +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); } - return table_->GetFilter(/*prefetch_buffer=*/nullptr, fltr_blk_handle, - is_a_filter_partition, no_io, - /*get_context=*/nullptr, context, - prefix_extractor); - } else { - auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, - 
is_a_filter_partition, prefix_extractor); - return {filter, nullptr /* cache */, nullptr /* cache_handle */, - true /* own_value */}; } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); } size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = idx_on_fltr_blk_->usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -276,16 +279,36 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies( - bool pin, const SliceTransform* prefix_extractor) { - // Before read partitions, prefetch them to avoid lots of IOs +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before read partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + IndexBlockIter biter; + const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); // Index partitions are 
assumed to be consecutive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -298,27 +321,55 @@ void PartitionedFilterBlockReader::CacheDependencies( uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s; - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, static_cast(prefetch_len)); // After prefetch, read the partitions one by one - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; - const bool no_io = true; - const bool is_a_filter_partition = true; - auto filter = table_->GetFilter( - prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /*get_context=*/nullptr, &lookup_context, prefix_extractor); - if (LIKELY(filter.IsCached())) { - if (pin) { - filter_map_[handle.offset()] = std::move(filter); + + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } } } } } +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 4b0fb523d0d..b73ae3baa75 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -14,8 +14,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" -#include "table/block_based/block_based_table_reader.h" -#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" #include "util/autovector.h" @@ -69,44 +68,57 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { BlockHandle last_encoded_handle_; }; -class PartitionedFilterBlockReader : public FilterBlockReader { +class PartitionedFilterBlockReader : public FilterBlockReaderCommon { public: - explicit PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full); - ~PartitionedFilterBlockReader() override; + PartitionedFilterBlockReader(const 
BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; private: - BlockHandle GetFilterPartitionHandle(const Slice& entry); - CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* context); - void CacheDependencies(bool bin, - const SliceTransform* prefix_extractor) override; - - const SliceTransform* prefix_extractor_; - std::unique_ptr idx_on_fltr_blk_; - const InternalKeyComparator comparator_; - const BlockBasedTable* table_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; - std::unordered_map> filter_map_; + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map> filter_map_; }; } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 34ecfa4ac65..5e9e467723c 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -7,6 +7,7 @@ #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/partitioned_filter_block.h" #include "table/full_filter_bits_builder.h" @@ -23,34 +24,29 @@ std::map slices; class MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test rep->cache_key_prefix_size = 10; + rep->index_key_includes_seq = 
pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } +}; - CachableEntry GetFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, bool /* unused */, GetContext* /* unused */, - BlockCacheLookupContext* /*context*/, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr /* cache */, nullptr /* cache_handle */, - true /* own_value */}; - } - - FilterBlockReader* ReadFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return obj; +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : slices) { + const uint64_t offset = pair.first; + const Slice& slice = pair.second; + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, + nullptr /* cache_handle */, true /* own_value */); + filter_map_[offset] = std::move(block); + } } }; @@ -58,10 +54,18 @@ class PartitionedFilterBlockTest : public testing::Test, virtual public ::testing::WithParamInterface { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; - InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); + InternalKeyComparator icomp_; + std::unique_ptr table_; + std::shared_ptr cache_; - PartitionedFilterBlockTest() { + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close // will access variable that are not @@ -70,7 +74,6 @@ class PartitionedFilterBlockTest table_options_.index_block_restart_interval = 3; } - std::shared_ptr cache_; ~PartitionedFilterBlockTest() override {} const std::string keys[4] = {"afoo", "bar", "box", "hello"}; @@ -110,7 +113,7 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* NewIndexBuilder() { const bool kValueDeltaEncoded = true; return PartitionedIndexBuilder::CreateIndexBuilder( - &icomp, !kValueDeltaEncoded, table_options_); + &icomp_, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -131,11 +134,8 @@ class PartitionedFilterBlockTest p_index_builder, partition_size); } - std::unique_ptr table; - PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, - const SliceTransform* prefix_extractor) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { BlockHandle bh; Status status; Slice slice; @@ -143,19 +143,21 @@ class PartitionedFilterBlockTest slice = builder->Finish(bh, &status); bh = Write(slice); } while (status.IsIncomplete()); - const Options options; - const ImmutableCFOptions ioptions(options); - const MutableCFOptions 
moptions(options); - const EnvOptions env_options; - const bool kSkipFilters = true; - const bool kImmortal = true; - table.reset(new MockedBlockBasedTable( - new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, - !kSkipFilters, 0, !kImmortal))); - auto reader = new PartitionedFilterBlockReader( - prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, - table.get(), pib->seperator_is_key_plus_seq(), - !pib->get_use_value_delta_encoding()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); return reader; } @@ -163,36 +165,37 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* pib, bool empty = false, const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr reader( - NewReader(builder, pib, prefix_extractor)); + NewReader(builder, pib)); // Querying added keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice, /*context=*/nullptr)); + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } } } @@ -336,13 +339,14 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { builder->Add(pkeys[2]); CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( - NewReader(builder.get(), pib.get(), prefix_extractor.get())); + NewReader(builder.get(), pib.get())); for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->PrefixMayMatch( prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, - /*no_io=*/false, &ikey_slice, /*context=*/nullptr)); + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } diff --git a/table/table_reader.h 
b/table/table_reader.h index 1c879cb1f81..72d11a7bd24 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -116,8 +116,7 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* /*out_file*/, - const SliceTransform* /*prefix_extractor*/) { + virtual Status DumpTable(WritableFile* /*out_file*/) { return Status::NotSupported("DumpTable() not supported"); } diff --git a/table/table_test.cc b/table/table_test.cc index c3a1f82ed37..c54933b781a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2296,7 +2296,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { // preloading filter/index blocks is enabled. auto reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); ASSERT_FALSE(reader->TEST_IndexBlockInCache()); { @@ -2343,7 +2343,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto* reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); ASSERT_TRUE(reader->TEST_IndexBlockInCache()); // -- PART 1: Open with regular block cache. @@ -2476,7 +2476,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 260d15f303c..44a733b57c6 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -150,8 +150,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { std::unique_ptr out_file; Env* env = Env::Default(); env->NewWritableFile(out_filename, &out_file, soptions_); - Status s = table_reader_->DumpTable(out_file.get(), - moptions_.prefix_extractor.get()); + Status s = table_reader_->DumpTable(out_file.get()); out_file->Close(); return s; } From 0acaa1a8464f35d0f4cf83a1bafbad662bfe0c99 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Tue, 16 Jul 2019 15:19:45 -0700 Subject: [PATCH 225/572] WriteUnPrepared: use tracked_keys_ to track keys needed for rollback (#5562) Summary: Currently, we are tracking keys we need to roll back via a separate structure specific to WriteUnprepared in write_set_keys_. We already have a data structure called tracked_keys_ used to track which keys to unlock on transaction termination. This is exactly what we want, since we should only roll back keys that we have locked anyway. Save some memory by reusing that data structure instead of making our own. 
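To make the reuse concrete, below is a minimal sketch under simplified assumptions. TrackedKeys here is a hypothetical stand-in for the real TransactionKeyMap (which additionally records per-key metadata such as sequence numbers and lock exclusivity); the point is that a single per-column-family key map can drive both the rollback path and the unlock-on-termination path, so no parallel write_set_keys_-style copy of the key set is needed:

  #include <cstdint>
  #include <iostream>
  #include <string>
  #include <unordered_map>

  // Hypothetical stand-in for tracked_keys_: column family ID -> keys this
  // transaction has locked/written (the bool is a dummy "exclusive" flag).
  using TrackedKeys =
      std::unordered_map<uint32_t, std::unordered_map<std::string, bool>>;

  // On abort: restore old values for exactly the keys the txn locked.
  void RollbackKeys(const TrackedKeys& tracked) {
    for (const auto& cf : tracked) {
      for (const auto& key : cf.second) {
        std::cout << "rollback cf=" << cf.first << " key=" << key.first << "\n";
      }
    }
  }

  // On termination: release exactly the same set of locks.
  void UnlockKeys(const TrackedKeys& tracked) {
    for (const auto& cf : tracked) {
      for (const auto& key : cf.second) {
        std::cout << "unlock cf=" << cf.first << " key=" << key.first << "\n";
      }
    }
  }

  int main() {
    TrackedKeys tracked;
    tracked[0]["foo"] = true;  // tracked when the txn wrote "foo" in CF 0
    tracked[1]["bar"] = true;

    RollbackKeys(tracked);
    UnlockKeys(tracked);
    return 0;
  }

Since rollback should touch exactly the keys the transaction locked, keeping one structure also removes any chance of the two key sets drifting apart.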
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5562 Differential Revision: D16206484 Pulled By: lth fbshipit-source-id: 5894d2b824a4b19062d84adbd6e6e86f00047488 --- utilities/transactions/transaction_base.h | 12 +- .../transactions/write_unprepared_txn.cc | 119 ++++++++++++++---- utilities/transactions/write_unprepared_txn.h | 22 ++-- .../transactions/write_unprepared_txn_db.cc | 36 +----- .../transactions/write_unprepared_txn_db.h | 27 ---- 5 files changed, 111 insertions(+), 105 deletions(-) diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 04274866aab..26efd51b378 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -317,6 +317,12 @@ class TransactionBaseImpl : public Transaction { // Records writes pending in this transaction WriteBatchWithIndex write_batch_; + // Map from column_family_id to map of keys that are involved in this + // transaction. + // For Pessimistic Transactions this is the list of locked keys. + // Optimistic Transactions will wait till commit time to do conflict checking. + TransactionKeyMap tracked_keys_; + private: friend class WritePreparedTxn; // Extra data to be persisted with the commit. Note this is only used when @@ -327,12 +333,6 @@ class TransactionBaseImpl : public Transaction { // nullptr if there was no snapshot at the time SetSavePoint() was called. std::unique_ptr>> save_points_; - // Map from column_family_id to map of keys that are involved in this - // transaction. - // For Pessimistic Transactions this is the list of locked keys. - // Optimistic Transactions will wait till commit time to do conflict checking. - TransactionKeyMap tracked_keys_; - // If true, future Put/Merge/Deletes will be indexed in the // WriteBatchWithIndex. // If false, future Put/Merge/Deletes will be inserted directly into the diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 54d478c9466..d127220e47d 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -42,7 +42,9 @@ SequenceNumber WriteUnpreparedTxnReadCallback::CalcMaxUnpreparedSequenceNumber( WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, const WriteOptions& write_options, const TransactionOptions& txn_options) - : WritePreparedTxn(txn_db, write_options, txn_options), wupt_db_(txn_db) { + : WritePreparedTxn(txn_db, write_options, txn_options), + wupt_db_(txn_db), + recovered_txn_(false) { max_write_batch_size_ = txn_options.max_write_batch_size; // We set max bytes to zero so that we don't get a memory limit error. // Instead of trying to keep write batch strictly under the size limit, we @@ -69,6 +71,12 @@ WriteUnpreparedTxn::~WriteUnpreparedTxn() { log_number_); } } + + // Call tracked_keys_.clear() so that ~PessimisticTransaction does not + // try to unlock keys for recovered transactions. 
+ if (recovered_txn_) { + tracked_keys_.clear(); + } } void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { @@ -76,7 +84,7 @@ void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { max_write_batch_size_ = txn_options.max_write_batch_size; write_batch_.SetMaxBytes(0); unprep_seqs_.clear(); - write_set_keys_.clear(); + recovered_txn_ = false; } Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, @@ -148,6 +156,72 @@ Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family, return TransactionBaseImpl::SingleDelete(column_family, key, assume_tracked); } +// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For +// WriteUnprepared, the write batches have already been written into the +// database during WAL replay, so all we have to do is just to "retrack" the key +// so that rollbacks are possible. +// +// Calling TryLock instead of TrackKey is also possible, but as an optimization, +// recovered transactions do not hold locks on their keys. This follows the +// implementation in PessimisticTransactionDB::Initialize where we set +// skip_concurrency_control to true. +Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) { + struct TrackKeyHandler : public WriteBatch::Handler { + WriteUnpreparedTxn* txn_; + bool rollback_merge_operands_; + + TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands) + : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + return Status::OK(); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + if (rollback_merge_operands_) { + txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber, + false /* read_only */, true /* exclusive */); + } + return Status::OK(); + } + + // Recovered batches do not contain 2PC markers. + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkNoop(bool) override { return Status::InvalidArgument(); } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + TrackKeyHandler handler(this, + wupt_db_->txn_db_options_.rollback_merge_operands); + return wb->Iterate(&handler); +} + Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { const bool kPrepared = true; Status s; @@ -159,25 +233,11 @@ Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { return s; } -void WriteUnpreparedTxn::UpdateWriteKeySet(uint32_t cfid, const Slice& key) { - // TODO(lth): write_set_keys_ can just be a std::string instead of a vector. 
- write_set_keys_[cfid].push_back(key.ToString()); -} - Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { if (name_.empty()) { return Status::InvalidArgument("Cannot write to DB without SetName."); } - // Update write_key_set_ for rollback purposes. - KeySetBuilder keyset_handler( - this, wupt_db_->txn_db_options_.rollback_merge_operands); - auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&keyset_handler); - assert(s.ok()); - if (!s.ok()) { - return s; - } - // TODO(lth): Reduce duplicate code with WritePrepared prepare logic. WriteOptions write_options = write_options_; write_options.disableWAL = false; @@ -204,10 +264,10 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { // WriteImpl should not overwrite that value, so set log_used to nullptr if // log_number_ is already set. uint64_t* log_used = log_number_ ? nullptr : &log_number_; - s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), - /*callback*/ nullptr, log_used, /*log ref*/ - 0, !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_, - &add_prepared_callback); + auto s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, log_used, /*log ref*/ + 0, !DISABLE_MEMTABLE, &seq_used, + prepare_batch_cnt_, &add_prepared_callback); assert(!s.ok() || seq_used != kMaxSequenceNumber); auto prepare_seq = seq_used; @@ -317,7 +377,6 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } // else do the 2nd write to publish seq @@ -349,7 +408,6 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } @@ -359,19 +417,21 @@ Status WriteUnpreparedTxn::RollbackInternal() { wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0); assert(GetId() != kMaxSequenceNumber); assert(GetId() > 0); + Status s; const auto& cf_map = *wupt_db_->GetCFHandleMap(); auto read_at_seq = kMaxSequenceNumber; - Status s; ReadOptions roptions; // Note that we do not use WriteUnpreparedTxnReadCallback because we do not // need to read our own writes when reading prior versions of the key for // rollback. 
+ const auto& tracked_keys = GetTrackedKeys(); WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq); - for (const auto& cfkey : write_set_keys_) { + for (const auto& cfkey : tracked_keys) { const auto cfid = cfkey.first; const auto& keys = cfkey.second; - for (const auto& key : keys) { + for (const auto& pair : keys) { + const auto& key = pair.first; const auto& cf_handle = cf_map.at(cfid); PinnableSlice pinnable_val; bool not_used; @@ -426,7 +486,6 @@ Status WriteUnpreparedTxn::RollbackInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } // else do the 2nd write for commit uint64_t& prepare_seq = seq_used; @@ -453,10 +512,16 @@ Status WriteUnpreparedTxn::RollbackInternal() { } unprep_seqs_.clear(); - write_set_keys_.clear(); return s; } +void WriteUnpreparedTxn::Clear() { + if (!recovered_txn_) { + txn_db_impl_->UnLock(this, &GetTrackedKeys()); + } + TransactionBaseImpl::Clear(); +} + Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 751d36c23b9..15a76d13437 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -94,20 +94,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { const SliceParts& key, const bool assume_tracked = false) override; - virtual Status RebuildFromWriteBatch(WriteBatch*) override { - // This function was only useful for recovering prepared transactions, but - // is unused for write prepared because a transaction may consist of - // multiple write batches. - // - // If there are use cases outside of recovery that can make use of this, - // then support could be added. - return Status::NotSupported("Not supported for WriteUnprepared"); - } + virtual Status RebuildFromWriteBatch(WriteBatch*) override; const std::map& GetUnpreparedSequenceNumbers(); - void UpdateWriteKeySet(uint32_t cfid, const Slice& key); - protected: void Initialize(const TransactionOptions& txn_options) override; @@ -118,6 +108,8 @@ class WriteUnpreparedTxn : public WritePreparedTxn { Status RollbackInternal() override; + void Clear() override; + // Get and GetIterator needs to be overridden so that a ReadCallback to // handle read-your-own-write is used. using Transaction::Get; @@ -157,10 +149,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // commit callbacks. std::map unprep_seqs_; - // Set of keys that have written to that have already been written to DB - // (ie. not in write_batch_). - // - std::map> write_set_keys_; + // Recovered transactions have tracked_keys_ populated, but are not actually + // locked for efficiency reasons. For recovered transactions, skip unlocking + // keys when transaction ends. 
+ bool recovered_txn_; }; } // namespace rocksdb diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 9382edfad2b..c4be058bb96 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -252,12 +252,13 @@ Status WriteUnpreparedTxnDB::Initialize( assert(real_trx); auto wupt = static_cast_with_check(real_trx); + wupt->recovered_txn_ = true; real_trx->SetLogNumber(first_log_number); real_trx->SetId(first_seq); Status s = real_trx->SetName(recovered_trx->name_); if (!s.ok()) { - break; + return s; } wupt->prepare_batch_cnt_ = last_prepare_batch_cnt; @@ -270,12 +271,11 @@ Status WriteUnpreparedTxnDB::Initialize( ordered_seq_cnt[seq] = cnt; assert(wupt->unprep_seqs_.count(seq) == 0); wupt->unprep_seqs_[seq] = cnt; - KeySetBuilder keyset_handler(wupt, - txn_db_options_.rollback_merge_operands); - s = batch_info.batch_->Iterate(&keyset_handler); + + s = wupt->RebuildFromWriteBatch(batch_info.batch_); assert(s.ok()); if (!s.ok()) { - break; + return s; } } @@ -284,7 +284,7 @@ Status WriteUnpreparedTxnDB::Initialize( real_trx->SetState(Transaction::PREPARED); if (!s.ok()) { - break; + return s; } } // AddPrepared must be called in order @@ -397,29 +397,5 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, return db_iter; } -Status KeySetBuilder::PutCF(uint32_t cf, const Slice& key, - const Slice& /*val*/) { - txn_->UpdateWriteKeySet(cf, key); - return Status::OK(); -} - -Status KeySetBuilder::DeleteCF(uint32_t cf, const Slice& key) { - txn_->UpdateWriteKeySet(cf, key); - return Status::OK(); -} - -Status KeySetBuilder::SingleDeleteCF(uint32_t cf, const Slice& key) { - txn_->UpdateWriteKeySet(cf, key); - return Status::OK(); -} - -Status KeySetBuilder::MergeCF(uint32_t cf, const Slice& key, - const Slice& /*val*/) { - if (rollback_merge_operands_) { - txn_->UpdateWriteKeySet(cf, key); - } - return Status::OK(); -} - } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h index 6405ba68381..65cb4b9195a 100644 --- a/utilities/transactions/write_unprepared_txn_db.h +++ b/utilities/transactions/write_unprepared_txn_db.h @@ -144,32 +144,5 @@ class WriteUnpreparedRollbackPreReleaseCallback : public PreReleaseCallback { SequenceNumber rollback_seq_; }; -struct KeySetBuilder : public WriteBatch::Handler { - WriteUnpreparedTxn* txn_; - bool rollback_merge_operands_; - - KeySetBuilder(WriteUnpreparedTxn* txn, bool rollback_merge_operands) - : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {} - - Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override; - - Status DeleteCF(uint32_t cf, const Slice& key) override; - - Status SingleDeleteCF(uint32_t cf, const Slice& key) override; - - Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override; - - // Recovered batches do not contain 2PC markers. 
- Status MarkNoop(bool) override { return Status::InvalidArgument(); } - Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } - Status MarkEndPrepare(const Slice&) override { - return Status::InvalidArgument(); - } - Status MarkCommit(const Slice&) override { return Status::InvalidArgument(); } - Status MarkRollback(const Slice&) override { - return Status::InvalidArgument(); - } -}; - } // namespace rocksdb #endif // ROCKSDB_LITE From 699a569c523c1d1083c2da79c5b42a3f70d74181 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 16 Jul 2019 16:27:32 -0700 Subject: [PATCH 226/572] Remove RandomAccessFileReader.for_compaction_ (#5572) Summary: RandomAccessFileReader.for_compaction_ doesn't seem to be used anymore. Remove it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5572 Test Plan: USE_CLANG=1 make all check -j Differential Revision: D16286178 fbshipit-source-id: aa338049761033dfbe5e8b1707bbb0be2df5be7e --- db/table_cache.cc | 5 ++--- db/table_cache.h | 3 +-- db/version_set.cc | 3 +-- util/file_reader_writer.cc | 1 - util/file_reader_writer.h | 5 +---- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index b98d4b074ff..121d4941fc0 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -93,7 +93,7 @@ Status TableCache::GetTableReader( bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, - bool prefetch_index_and_filter_in_cache, bool for_compaction) { + bool prefetch_index_and_filter_in_cache) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; @@ -109,8 +109,7 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.env, record_read_stats ? 
ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, for_compaction, - ioptions_.listeners)); + file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, diff --git a/db/table_cache.h b/db/table_cache.h index f274337e952..f9fd4815228 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -179,8 +179,7 @@ class TableCache { std::unique_ptr* table_reader, const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true, - bool for_compaction = false); + bool prefetch_index_and_filter_in_cache = true); const ImmutableCFOptions& ioptions_; const EnvOptions& env_options_; diff --git a/db/version_set.cc b/db/version_set.cc index 32dd61db830..0d3b9fb4e32 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1231,8 +1231,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, new RandomAccessFileReader( std::move(file), file_name, nullptr /* env */, nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, false /* for_compaction*/, - ioptions->listeners)); + nullptr /* rate_limiter */, ioptions->listeners)); s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index db16e82ae11..15f41bf3a06 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -197,7 +197,6 @@ Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs, Status s; uint64_t elapsed = 0; assert(!use_direct_io()); - assert(!for_compaction_); { StopWatch sw(env_, stats_, hist_type_, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 0c5089d0758..3052ca8f4e0 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -108,7 +108,6 @@ class RandomAccessFileReader { uint32_t hist_type_; HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; - bool for_compaction_; std::vector> listeners_; public: @@ -116,7 +115,7 @@ class RandomAccessFileReader { std::unique_ptr&& raf, std::string _file_name, Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, - RateLimiter* rate_limiter = nullptr, bool for_compaction = false, + RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}) : file_(std::move(raf)), file_name_(std::move(_file_name)), @@ -125,7 +124,6 @@ class RandomAccessFileReader { hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - for_compaction_(for_compaction), listeners_() { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), @@ -151,7 +149,6 @@ class RandomAccessFileReader { hist_type_ = std::move(o.hist_type_); file_read_hist_ = std::move(o.file_read_hist_); rate_limiter_ = std::move(o.rate_limiter_); - for_compaction_ = std::move(o.for_compaction_); return *this; } From 0f4d90e6e4b3295cb5b6df6bbc36d2e2101b95f0 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Tue, 16 Jul 2019 18:18:07 -0700 Subject: [PATCH 227/572] Added support for sequential read-ahead file (#5580) Summary: Added support for sequential read-ahead file that can prefetch the read data and later serve it from internal cache buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5580 Differential Revision: D16287082 Pulled By: elipoz fbshipit-source-id: a3e7ad9643d377d39352ff63058ce050ec31dcf3 --- test_util/testutil.h | 221 ++++++++++++++++---------------- util/file_reader_writer.cc | 214 +++++++++++++++++++++++++++---- util/file_reader_writer.h | 18 ++- util/file_reader_writer_test.cc | 119 ++++++++++++++++- 4 files changed, 429 insertions(+), 143 deletions(-) diff --git a/test_util/testutil.h b/test_util/testutil.h index bb732ff3a5a..716ae7d26e8 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -492,13 +492,11 @@ inline std::string EncodeInt(uint64_t x) { return result; } -class StringEnv : public EnvWrapper { - public: class SeqStringSource : public SequentialFile { public: explicit SeqStringSource(const std::string& data) : data_(data), offset_(0) {} - ~SeqStringSource() {} + ~SeqStringSource() override {} Status Read(size_t n, Slice* result, char* scratch) override { std::string output; if (offset_ < data_.size()) { @@ -527,129 +525,136 @@ class StringEnv : public EnvWrapper { size_t offset_; }; - class StringSink : public WritableFile { + class StringEnv : public EnvWrapper { public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } + class StringSink : public WritableFile { + public: + explicit StringSink(std::string* contents) + : WritableFile(), contents_(contents) {} + virtual Status 
Truncate(uint64_t size) override { + contents_->resize(static_cast(size)); + return Status::OK(); + } + virtual Status Close() override { return Status::OK(); } + virtual Status Flush() override { return Status::OK(); } + virtual Status Sync() override { return Status::OK(); } + virtual Status Append(const Slice& slice) override { + contents_->append(slice.data(), slice.size()); + return Status::OK(); + } - private: - std::string* contents_; - }; + private: + std::string* contents_; + }; - explicit StringEnv(Env* t) : EnvWrapper(t) {} - virtual ~StringEnv() {} + explicit StringEnv(Env* t) : EnvWrapper(t) {} + ~StringEnv() override {} - const std::string& GetContent(const std::string& f) { return files_[f]; } + const std::string& GetContent(const std::string& f) { return files_[f]; } - const Status WriteToNewFile(const std::string& file_name, - const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (!s.ok()) { - return s; + const Status WriteToNewFile(const std::string& file_name, + const std::string& content) { + std::unique_ptr r; + auto s = NewWritableFile(file_name, &r, EnvOptions()); + if (!s.ok()) { + return s; + } + r->Append(content); + r->Flush(); + r->Close(); + assert(files_[file_name] == content); + return Status::OK(); } - r->Append(content); - r->Flush(); - r->Close(); - assert(files_[file_name] == content); - return Status::OK(); - } - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& /*options*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second)); + return Status::OK(); + } + Status NewRandomAccessFile(const std::string& /*f*/, + std::unique_ptr* /*r*/, + const EnvOptions& /*options*/) override { + return Status::NotSupported(); + } + Status NewWritableFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); + auto iter = files_.find(f); + if (iter != files_.end()) { + return Status::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return Status::OK(); } - r->reset(new SeqStringSource(iter->second)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); + virtual Status NewDirectory( + const std::string& /*name*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported(); } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory(const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); + Status FileExists(const std::string& f) 
override { + if (files_.find(f) == files_.end()) { + return Status::NotFound(); + } + return Status::OK(); } - return Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); + Status GetChildren(const std::string& /*dir*/, + std::vector* /*r*/) override { + return Status::NotSupported(); + } + Status DeleteFile(const std::string& f) override { + files_.erase(f); + return Status::OK(); + } + Status CreateDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status CreateDirIfMissing(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status DeleteDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return Status::OK(); } - *s = iter->second.size(); - return Status::OK(); - } - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } + Status GetFileModificationTime(const std::string& /*fname*/, + uint64_t* /*file_mtime*/) override { + return Status::NotSupported(); + } - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status RenameFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LinkFile(const std::string& /*s*/, const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status LinkFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); - } + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { + return Status::NotSupported(); + } - Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } + Status UnlockFile(FileLock* /*l*/) override { + return Status::NotSupported(); + } - protected: - std::unordered_map files_; -}; + protected: + std::unordered_map files_; + }; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 15f41bf3a06..9175fa502f9 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -639,6 +639,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { + // Read-ahead only make sense if we have some slack left after reading if (n + alignment_ >= readahead_size_) { return file_->Read(offset, n, result, scratch); } @@ -646,14 +647,13 @@ class ReadaheadRandomAccessFile : 
public RandomAccessFile {
   std::unique_lock lk(lock_);
 
     size_t cached_len = 0;
-    // Check if there is a cache hit, means that [offset, offset + n) is either
-    // completely or partially in the buffer
+    // Check if there is a cache hit, meaning that [offset, offset + n) is either
+    // completely or partially in the buffer.
     // If it's completely cached, including end of file case when offset + n is
-    // greater than EOF, return
+    // greater than EOF, then return.
     if (TryReadFromCache(offset, n, &cached_len, scratch) &&
-        (cached_len == n ||
-         // End of file
-         buffer_.CurrentSize() < readahead_size_)) {
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
       *result = Slice(scratch, cached_len);
       return Status::OK();
     }
@@ -661,25 +661,14 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
     // In the case of cache hit advanced_offset is already aligned, means that
     // chunk_offset equals to advanced_offset
     size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
-    Slice readahead_result;
     Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
     if (s.ok()) {
-      // In the case of cache miss, i.e. when cached_len equals 0, an offset can
-      // exceed the file end position, so the following check is required
-      if (advanced_offset < chunk_offset + buffer_.CurrentSize()) {
-        // In the case of cache miss, the first chunk_padding bytes in buffer_
-        // are
-        // stored for alignment only and must be skipped
-        size_t chunk_padding = advanced_offset - chunk_offset;
-        auto remaining_len =
-            std::min(buffer_.CurrentSize() - chunk_padding, n - cached_len);
-        memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding,
-               remaining_len);
-        *result = Slice(scratch, cached_len + remaining_len);
-      } else {
-        *result = Slice(scratch, cached_len);
-      }
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+                       scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
     }
     return s;
   }
@@ -690,6 +679,9 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
       // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
       return Status::OK();
     }
+
+    std::unique_lock lk(lock_);
+
     size_t offset_ = static_cast(offset);
     size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
     if (prefetch_offset == buffer_offset_) {
@@ -706,12 +698,18 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
   void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
 
   Status InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock lk(lock_);
+    buffer_.Clear();
     return file_->InvalidateCache(offset, length);
   }
 
   bool use_direct_io() const override { return file_->use_direct_io(); }
 
  private:
+  // Tries to read from buffer_ n bytes starting at offset. If anything was read
+  // from the cache, it sets cached_len to the number of bytes actually read,
+  // copies that number of bytes to scratch and returns true.
+  // If nothing was read, sets cached_len to 0 and returns false.
   bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
                         char* scratch) const {
     if (offset < buffer_offset_ ||
@@ -726,6 +724,9 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
     return true;
   }
 
+  // Reads into buffer_ the next n bytes from file_ starting at offset.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
Status ReadIntoBuffer(uint64_t offset, size_t n) const {
     if (n > buffer_.Capacity()) {
       n = buffer_.Capacity();
     }
@@ -742,14 +743,171 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
     return s;
   }
 
-  std::unique_ptr file_;
+  const std::unique_ptr file_;
   const size_t alignment_;
-  size_t readahead_size_;
+  const size_t readahead_size_;
 
   mutable std::mutex lock_;
+  // The buffer storing the prefetched data
   mutable AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
   mutable uint64_t buffer_offset_;
 };
+
+// This class wraps a SequentialFile, exposing the same API, with the difference
+// of being able to prefetch up to readahead_size bytes and then serve them
+// from memory, avoiding the entire round-trip if, for example, the data for the
+// file is actually remote.
+class ReadaheadSequentialFile : public SequentialFile {
+ public:
+  ReadaheadSequentialFile(std::unique_ptr&& file,
+                          size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0),
+        read_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+  ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    std::unique_lock lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including end of file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return Status::OK();
+    }
+    n -= cached_len;
+
+    Status s;
+    // Read-ahead only makes sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      s = file_->Read(n, result, scratch + cached_len);
+      if (s.ok()) {
+        read_offset_ += result->size();
+        *result = Slice(scratch, cached_len + result->size());
+      }
+      buffer_.Clear();
+      return s;
+    }
+
+    s = ReadIntoBuffer(readahead_size_);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(n, &remaining_len, scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  Status Skip(uint64_t n) override {
+    std::unique_lock lk(lock_);
+    Status s = Status::OK();
+    // First check if we need to skip already cached data
+    if (buffer_.CurrentSize() > 0) {
+      // Do we need to skip beyond cached data?
+      if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+        // Yes. Skip whatever is in memory and adjust offset accordingly
+        n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+        read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+      } else {
+        // No. The entire section to be skipped is entirely in cache.
+        read_offset_ += n;
+        n = 0;
+      }
+    }
+    if (n > 0) {
+      // We still need to skip more, so call the file API for skipping
+      s = file_->Skip(n);
+      if (s.ok()) {
+        read_offset_ += n;
+      }
+      buffer_.Clear();
+    }
+    return s;
+  }
+
+  Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+                        char* scratch) override {
+    return file_->PositionedRead(offset, n, result, scratch);
+  }
+
+  Status InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read from buffer_ n bytes. If anything was read from the cache, it
+  // sets cached_len to the number of bytes actually read, copies that number
+  // of bytes to scratch and returns true.
+  // If nothing was read, sets cached_len to 0 and returns false.
+  bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) {
+    if (read_offset_ < buffer_offset_ ||
+        read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = read_offset_ - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    read_offset_ += *cached_len;
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operation on the file.
+  Status ReadIntoBuffer(size_t n) {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    Status s = file_->Read(n, &result, buffer_.BufferStart());
+    if (s.ok()) {
+      buffer_offset_ = read_offset_;
+      buffer_.Size(result.size());
+      assert(buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  std::mutex lock_;
+  // The buffer storing the prefetched data
+  AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  uint64_t buffer_offset_;
+  // The offset up to which data was read from file_. In fact, it can be larger
+  // than the actual file size, since the file_->Skip(n) call doesn't return the
+  // actual number of bytes that were skipped, which can be less than n.
+  // This is not a problem since read_offset_ is monotonically increasing and
+  // its only use is to figure out if the next piece of data should be read from
+  // buffer_ or file_ directly.
+ uint64_t read_offset_; +}; } // namespace Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, @@ -866,6 +1024,14 @@ std::unique_ptr NewReadaheadRandomAccessFile( return result; } +std::unique_ptr +SequentialFileReader::NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( + new ReadaheadSequentialFile(std::move(file), readahead_size)); + return result; +} + Status NewWritableFile(Env* env, const std::string& fname, std::unique_ptr* result, const EnvOptions& options) { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 3052ca8f4e0..a93274644c4 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -43,12 +43,18 @@ class SequentialFileReader { private: std::unique_ptr file_; std::string file_name_; - std::atomic offset_; // read offset + std::atomic offset_{0}; // read offset public: explicit SequentialFileReader(std::unique_ptr&& _file, const std::string& _file_name) - : file_(std::move(_file)), file_name_(_file_name), offset_(0) {} + : file_(std::move(_file)), file_name_(_file_name) {} + + explicit SequentialFileReader(std::unique_ptr&& _file, + const std::string& _file_name, + size_t _readahead_size) + : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)), + file_name_(_file_name) {} SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); @@ -66,13 +72,17 @@ class SequentialFileReader { Status Skip(uint64_t n); - void Rewind(); - SequentialFile* file() { return file_.get(); } std::string file_name() { return file_name_; } bool use_direct_io() const { return file_->use_direct_io(); } + + private: + // NewReadaheadSequentialFile provides a wrapper over SequentialFile to + // always prefetch additional data with every read. + static std::unique_ptr NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size); }; // RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. 
It is diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index a4a9458d642..aa74303b8fc 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -275,7 +275,7 @@ TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSizeTest) { } TEST_P(ReadaheadRandomAccessFileTest, - SourceStrLenCanBeGreaterThanReadaheadSizeTest) { + SourceStrLenGreaterThanReadaheadSizeTest) { Random rng(42); for (int k = 0; k < 100; ++k) { size_t strLen = k * GetReadaheadSize() + @@ -286,13 +286,13 @@ TEST_P(ReadaheadRandomAccessFileTest, for (int test = 1; test <= 100; ++test) { size_t offset = rng.Uniform(static_cast(strLen)); size_t n = rng.Uniform(static_cast(GetReadaheadSize())); - ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(offset, n)); } } } -TEST_P(ReadaheadRandomAccessFileTest, NExceedReadaheadTest) { +TEST_P(ReadaheadRandomAccessFileTest, ReadExceedsReadaheadSizeTest) { Random rng(7); size_t strLen = 4 * GetReadaheadSize() + rng.Uniform(static_cast(GetReadaheadSize())); @@ -303,7 +303,7 @@ TEST_P(ReadaheadRandomAccessFileTest, NExceedReadaheadTest) { size_t offset = rng.Uniform(static_cast(strLen)); size_t n = GetReadaheadSize() + rng.Uniform(static_cast(GetReadaheadSize())); - ASSERT_EQ(str.substr(offset, std::min(n, str.size() - offset)), + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(offset, n)); } } @@ -315,13 +315,118 @@ INSTANTIATE_TEST_CASE_P( SourceStrLenLessThanReadaheadSizeTest, ReadaheadRandomAccessFileTest, ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); INSTANTIATE_TEST_CASE_P( - SourceStrLenCanBeGreaterThanReadaheadSizeTest, - ReadaheadRandomAccessFileTest, + SourceStrLenGreaterThanReadaheadSizeTest, ReadaheadRandomAccessFileTest, ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); INSTANTIATE_TEST_CASE_P( - NExceedReadaheadTest, ReadaheadRandomAccessFileTest, + ReadExceedsReadaheadSizeTest, ReadaheadRandomAccessFileTest, ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +class ReadaheadSequentialFileTest : public testing::Test, + public testing::WithParamInterface { + public: + static std::vector GetReadaheadSizeList() { + return {1lu << 12, 1lu << 16}; + } + void SetUp() override { + readahead_size_ = GetParam(); + scratch_.reset(new char[2 * readahead_size_]); + ResetSourceStr(); + } + ReadaheadSequentialFileTest() {} + std::string Read(size_t n) { + Slice result; + test_read_holder_->Read(n, &result, scratch_.get()); + return std::string(result.data(), result.size()); + } + void Skip(size_t n) { test_read_holder_->Skip(n); } + void ResetSourceStr(const std::string& str = "") { + auto read_holder = + std::unique_ptr(new test::SeqStringSource(str)); + test_read_holder_.reset(new SequentialFileReader(std::move(read_holder), + "test", readahead_size_)); + } + size_t GetReadaheadSize() const { return readahead_size_; } + + private: + size_t readahead_size_; + std::unique_ptr test_read_holder_; + std::unique_ptr scratch_; +}; + +TEST_P(ReadaheadSequentialFileTest, EmptySourceStrTest) { + ASSERT_EQ("", Read(0)); + ASSERT_EQ("", Read(1)); + ASSERT_EQ("", Read(13)); +} + +TEST_P(ReadaheadSequentialFileTest, SourceStrLenLessThanReadaheadSizeTest) { + std::string str = "abcdefghijklmnopqrs"; + ResetSourceStr(str); + ASSERT_EQ(str.substr(0, 3), Read(3)); + ASSERT_EQ(str.substr(3, 1), Read(1)); + ASSERT_EQ(str.substr(4), Read(str.size())); + 
ASSERT_EQ("", Read(100)); +} + +TEST_P(ReadaheadSequentialFileTest, SourceStrLenGreaterThanReadaheadSizeTest) { + Random rng(42); + for (int s = 0; s < 1; ++s) { + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast(strLen)); + ResetSourceStr(str); + size_t offset = 0; + for (int test = 1; test <= 100; ++test) { + size_t n = rng.Uniform(static_cast(GetReadaheadSize())); + if (s && test % 2) { + Skip(n); + } else { + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n)); + } + offset = std::min(offset + n, strLen); + } + } + } +} + +TEST_P(ReadaheadSequentialFileTest, ReadExceedsReadaheadSizeTest) { + Random rng(42); + for (int s = 0; s < 1; ++s) { + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = + test::RandomHumanReadableString(&rng, static_cast(strLen)); + ResetSourceStr(str); + size_t offset = 0; + for (int test = 1; test <= 100; ++test) { + size_t n = GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + if (s && test % 2) { + Skip(n); + } else { + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n)); + } + offset = std::min(offset + n, strLen); + } + } + } +} + +INSTANTIATE_TEST_CASE_P( + EmptySourceStrTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenLessThanReadaheadSizeTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenGreaterThanReadaheadSizeTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + ReadExceedsReadaheadSizeTest, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); } // namespace rocksdb int main(int argc, char** argv) { From 74fb7f0ba53ecce443ff0a619199c0e2cb74ab35 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Tue, 16 Jul 2019 19:13:35 -0700 Subject: [PATCH 228/572] Cleaned up and simplified LRU cache implementation (#5579) Summary: The 'refs' field in LRUHandle now counts only external references, since anyway we already have the IN_CACHE flag. This simplifies reference accounting logic a bit. Also cleaned up few asserts code as well as the comments - to be more readable. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5579 Differential Revision: D16286747 Pulled By: elipoz fbshipit-source-id: 7186d88f80f512ce584d0a303437494b5cbefd7f --- .gitignore | 1 + cache/cache_test.cc | 4 +- cache/lru_cache.cc | 124 ++++++++++++++++++++------------------------ cache/lru_cache.h | 78 +++++++++++++++------------- 4 files changed, 100 insertions(+), 107 deletions(-) diff --git a/.gitignore b/.gitignore index 6364dfdc401..180fb4c5007 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ ldb manifest_dump sst_dump blob_dump +block_cache_trace_analyzer column_aware_encoding_exp util/build_version.cc build_tools/VALGRIND_LOGS/ diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 46ce78db68f..b728c67c7d7 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -562,6 +562,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { ASSERT_OK(s); ASSERT_NE(nullptr, handles[i]); } + ASSERT_EQ(10, cache->GetUsage()); // test2: set the flag to true. Insert and check if it fails. std::string extra_key = "extra"; @@ -571,6 +572,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); ASSERT_TRUE(s.IsIncomplete()); ASSERT_EQ(nullptr, handle); + ASSERT_EQ(10, cache->GetUsage()); for (size_t i = 0; i < 10; i++) { cache->Release(handles[i]); @@ -591,7 +593,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache2->Insert(extra_key, extra_value, 1, &deleter); // AS if the key have been inserted into cache but get evicted immediately. ASSERT_OK(s); - ASSERT_EQ(5, cache->GetUsage()); + ASSERT_EQ(5, cache2->GetUsage()); ASSERT_EQ(nullptr, cache2->Lookup(extra_key)); for (size_t i = 0; i < 5; i++) { diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 676bed3051c..7c04cb909d5 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -24,7 +24,7 @@ LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { LRUHandleTable::~LRUHandleTable() { ApplyToAllCacheEntries([](LRUHandle* h) { - if (h->refs == 1) { + if (!h->HasRefs()) { h->Free(); } }); @@ -113,29 +113,17 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, SetCapacity(capacity); } -LRUCacheShard::~LRUCacheShard() {} - -bool LRUCacheShard::Unref(LRUHandle* e) { - assert(e->refs > 0); - e->refs--; - return e->refs == 0; -} - -// Call deleter and free - void LRUCacheShard::EraseUnRefEntries() { autovector last_reference_list; { MutexLock l(&mutex_); while (lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == - 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); usage_ -= old->charge; last_reference_list.push_back(old); } @@ -148,22 +136,27 @@ void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) { + const auto applyCallback = [&]() { + table_.ApplyToAllCacheEntries( + [callback](LRUHandle* h) { callback(h->value, h->charge); }); + }; + if (thread_safe) { - mutex_.Lock(); - } - table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - if (thread_safe) { - mutex_.Unlock(); + MutexLock l(&mutex_); + applyCallback(); + } else { + applyCallback(); } } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { + MutexLock 
l(&mutex_); *lru = &lru_; *lru_low_pri = lru_low_pri_; } size_t LRUCacheShard::TEST_GetLRUSize() { + MutexLock l(&mutex_); LRUHandle* lru_handle = lru_.next; size_t lru_size = 0; while (lru_handle != &lru_) { @@ -231,14 +224,13 @@ void LRUCacheShard::MaintainPoolSize() { void LRUCacheShard::EvictFromLRU(size_t charge, autovector* deleted) { - while (usage_ + charge > capacity_ && lru_.next != &lru_) { + while ((usage_ + charge) > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); usage_ -= old->charge; deleted->push_back(old); } @@ -252,8 +244,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) { high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_; EvictFromLRU(0, &last_reference_list); } - // we free the entries here outside of mutex for - // performance reasons + + // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -269,22 +261,22 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { LRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { assert(e->InCache()); - if (e->refs == 1) { + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); } - e->refs++; + e->Ref(); e->SetHit(); } return reinterpret_cast(e); } bool LRUCacheShard::Ref(Cache::Handle* h) { - LRUHandle* handle = reinterpret_cast(h); + LRUHandle* e = reinterpret_cast(h); MutexLock l(&mutex_); - if (handle->InCache() && handle->refs == 1) { - LRU_Remove(handle); - } - handle->refs++; + // To create another reference - entry must be already externally referenced + assert(e->HasRefs()); + e->Ref(); return true; } @@ -303,30 +295,27 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { bool last_reference = false; { MutexLock l(&mutex_); - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (e->refs == 1 && e->InCache()) { + last_reference = e->Unref(); + if (last_reference && e->InCache()) { // The item is still in cache, and nobody else holds a reference to it if (usage_ > capacity_ || force_erase) { - // the cache is full // The LRU list must be empty since the cache is full - assert(!(usage_ > capacity_) || lru_.next == &lru_); - // take this opportunity and remove the item + assert(lru_.next == &lru_ || force_erase); + // Take this opportunity and remove the item table_.Remove(e->key(), e->hash); e->SetInCache(false); - Unref(e); - usage_ -= e->charge; - last_reference = true; } else { - // put the item on the list to be potentially freed + // Put the item back on the LRU list, and don't free it LRU_Insert(e); + last_reference = false; } } + if (last_reference) { + usage_ -= e->charge; + } } - // free outside of mutex + // Free the entry here outside of mutex for performance reasons if (last_reference) { e->Free(); } @@ -342,7 +331,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // It shouldn't happen very often though. 
LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s; + Status s = Status::OK(); autovector last_reference_list; e->value = value; @@ -351,9 +340,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, e->key_length = key.size(); e->flags = 0; e->hash = hash; - e->refs = (handle == nullptr - ? 1 - : 2); // One from LRUCache, one for the returned handle + e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); @@ -366,11 +353,12 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // is freed or the lru list is empty EvictFromLRU(charge, &last_reference_list); - if (usage_ - lru_usage_ + charge > capacity_ && + if ((usage_ + charge) > capacity_ && (strict_capacity_limit_ || handle == nullptr)) { if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry inserted // into cache and get evicted immediately. + e->SetInCache(false); last_reference_list.push_back(e); } else { delete[] reinterpret_cast(e); @@ -378,32 +366,30 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, s = Status::Incomplete("Insert failed due to LRU cache being full."); } } else { - // insert into the cache - // note that the cache might get larger than its capacity if not enough - // space was freed + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. LRUHandle* old = table_.Insert(e); usage_ += e->charge; if (old != nullptr) { + assert(old->InCache()); old->SetInCache(false); - if (Unref(old)) { - usage_ -= old->charge; - // old is on LRU because it's in cache and its reference count - // was just 1 (Unref returned 0) + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 LRU_Remove(old); + usage_ -= old->charge; last_reference_list.push_back(old); } } if (handle == nullptr) { LRU_Insert(e); } else { + e->Ref(); *handle = reinterpret_cast(e); } - s = Status::OK(); } } - // we free the entries here outside of mutex for - // performance reasons + // Free the entries here outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -418,18 +404,18 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); e = table_.Remove(key, hash); if (e != nullptr) { - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (last_reference && e->InCache()) { + assert(e->InCache()); + e->SetInCache(false); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); + usage_ -= e->charge; + last_reference = true; } - e->SetInCache(false); } } - // mutex not held here + // Free the entry here outside of mutex for performance reasons // last_reference will only be true if e != nullptr if (last_reference) { e->Free(); diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 0d9a317486e..1ff765d1592 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -17,31 +17,34 @@ namespace rocksdb { -// LRU cache implementation +// LRU cache implementation. This class is not thread-safe. // An entry is a variable length heap-allocated structure. // Entries are referenced by cache and/or by any external entity. -// The cache keeps all its entries in table. Some elements +// The cache keeps all its entries in a hash table. Some elements // are also stored on LRU list. // // LRUHandle can be in these states: // 1. 
Referenced externally AND in hash table.
-// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
-// 2. Not referenced externally and in hash table. In that case the entry is
-// in the LRU and can be freed. (refs == 1 && in_cache == true)
-// 3. Referenced externally and not in hash table. In that case the entry is
-// in not on LRU and not in table. (refs >= 1 && in_cache == false)
+//    In that case the entry is *not* in the LRU list
+//    (refs >= 1 && in_cache == true)
+// 2. Not referenced externally AND in hash table.
+//    In that case the entry is in the LRU list and can be freed.
+//    (refs == 0 && in_cache == true)
+// 3. Referenced externally AND not in hash table.
+//    In that case the entry is not in the LRU list and not in hash table.
+//    The entry can be freed when refs becomes 0.
+//    (refs >= 1 && in_cache == false)
 //
 // All newly created LRUHandles are in state 1. If you call
-// LRUCacheShard::Release
-// on entry in state 1, it will go into state 2. To move from state 1 to
-// state 3, either call LRUCacheShard::Erase or LRUCacheShard::Insert with the
-// same key.
+// LRUCacheShard::Release on entry in state 1, it will go into state 2.
+// To move from state 1 to state 3, either call LRUCacheShard::Erase or
+// LRUCacheShard::Insert with the same key (but possibly different value).
 // To move from state 2 to state 1, use LRUCacheShard::Lookup.
 // Before destruction, make sure that no handles are in state 1. This means
 // that any successful LRUCacheShard::Lookup/LRUCacheShard::Insert have a
-// matching
-// RUCache::Release (to move into state 2) or LRUCacheShard::Erase (for state 3)
+// matching LRUCache::Release (to move into state 2) or LRUCacheShard::Erase
+// (to move into state 3).
 
 struct LRUHandle {
   void* value;
@@ -51,37 +54,42 @@ struct LRUHandle {
   LRUHandle* prev;
   size_t charge;  // TODO(opt): Only allow uint32_t?
   size_t key_length;
-  uint32_t refs;     // a number of refs to this entry
-                     // cache itself is counted as 1
-
-  // Include the following flags:
-  //   IN_CACHE:         whether this entry is referenced by the hash table.
-  //   IS_HIGH_PRI:      whether this entry is high priority entry.
-  //   IN_HIGH_PRI_POOL: whether this entry is in high-pri pool.
-  //   HAS_HIT:          whether this entry has had any lookups (hits).
+  // The hash of key(). Used for fast sharding and comparisons.
+  uint32_t hash;
+  // The number of external refs to this entry. The cache itself is not counted.
+  uint32_t refs;
+
   enum Flags : uint8_t {
+    // Whether this entry is referenced by the hash table.
     IN_CACHE = (1 << 0),
+    // Whether this entry is a high priority entry.
     IS_HIGH_PRI = (1 << 1),
+    // Whether this entry is in high-pri pool.
     IN_HIGH_PRI_POOL = (1 << 2),
+    // Whether this entry has had any lookups (hits).
     HAS_HIT = (1 << 3),
   };
 
   uint8_t flags;
-  uint32_t hash;  // Hash of key(); used for fast sharding and comparisons
+  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
+  char key_data[1];
 
-  char key_data[1];  // Beginning of key
+  Slice key() const { return Slice(key_data, key_length); }
 
-  Slice key() const {
-    // For cheaper lookups, we allow a temporary Handle object
-    // to store a pointer to a key in "value".
-    if (next == this) {
-      return *(reinterpret_cast(value));
-    } else {
-      return Slice(key_data, key_length);
-    }
+  // Increase the reference count by 1.
+  void Ref() { refs++; }
+
+  // Just reduce the reference count by 1. Return true if it was the last reference.
+ bool Unref() { + assert(refs > 0); + refs--; + return refs == 0; } + // Return true if there are external refs, false otherwise. + bool HasRefs() const { return refs > 0; } + bool InCache() const { return flags & IN_CACHE; } bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } @@ -114,7 +122,7 @@ struct LRUHandle { void SetHit() { flags |= HAS_HIT; } void Free() { - assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); + assert(refs == 0); if (deleter) { (*deleter)(key(), value); } @@ -169,7 +177,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, bool use_adaptive_mutex); - virtual ~LRUCacheShard(); + virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache // if current usage is more than new capacity, the function will attempt to @@ -225,10 +233,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // high-pri pool is no larger than the size specify by high_pri_pool_pct. void MaintainPoolSize(); - // Just reduce the reference count by 1. - // Return true if last reference - bool Unref(LRUHandle* e); - // Free some space following strict LRU policy until enough space // to hold (usage_ + charge) is freed or the lru list is empty // This function is not thread safe - it needs to be executed while From a3c1832e862ab4b76ccf1299d6e95b15eb50730e Mon Sep 17 00:00:00 2001 From: Yuqi Gu Date: Wed, 17 Jul 2019 11:19:06 -0700 Subject: [PATCH 229/572] Arm64 CRC32 parallel computation optimization for RocksDB (#5494) Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. 
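For reviewers without Arm hardware, the accelerated path can be sanity-checked against a portable reference: any CRC32C implementation must reproduce the standard check value crc32c("123456789") == 0xE3069283. The sketch below is illustrative only and is not part of this patch; `Crc32cReference` is a hypothetical name:
```
// Portable bit-by-bit CRC-32C (Castagnoli, reflected polynomial 0x82F63B78).
// Slow, but useful as a reference that any accelerated path must agree with.
#include <cassert>
#include <cstdint>

uint32_t Crc32cReference(uint32_t crc, const unsigned char* data,
                         unsigned len) {
  crc ^= 0xffffffff;
  for (unsigned i = 0; i < len; i++) {
    crc ^= data[i];
    for (int k = 0; k < 8; k++) {
      // Conditionally XOR in the polynomial when the low bit is set.
      crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
  }
  return crc ^ 0xffffffff;
}

int main() {
  const unsigned char buf[] = "123456789";
  assert(Crc32cReference(0, buf, 9) == 0xE3069283u);
  return 0;
}
```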
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494

Differential Revision: D16340806

fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945
---
 Makefile             |  4 +--
 util/crc32c_arm64.cc | 83 ++++++++++++++++++++++++++++++++++----------
 util/crc32c_arm64.h  | 15 ++++++--
 3 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 1828b833b02..100f160ca31 100644
--- a/Makefile
+++ b/Makefile
@@ -144,8 +144,8 @@ HAVE_POWER8=1
 endif

 ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
-CXXFLAGS += -march=armv8-a+crc
-CFLAGS += -march=armv8-a+crc
+CXXFLAGS += -march=armv8-a+crc+crypto
+CFLAGS += -march=armv8-a+crc+crypto
 ARMCRC_SOURCE=1
 endif

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 62fabe99e3c..8743f8c721c 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -19,35 +19,82 @@ uint32_t crc32c_runtime_check(void) {

 uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
                       unsigned len) {
-  const uint8_t *buf1;
-  const uint16_t *buf2;
-  const uint32_t *buf4;
-  const uint64_t *buf8;
+  const uint8_t *buf8;
+  const uint64_t *buf64 = (uint64_t *)data;
+  int length = (int)len;
+  crc ^= 0xffffffff;

+#ifdef HAVE_ARM64_CRYPTO
+  /* Crc32c Parallel computation
+   * Algorithm comes from Intel whitepaper:
+   * crc-iscsi-polynomial-crc32-instruction-paper
+   *
+   * Input data is divided into three equal-sized blocks
+   * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
+   * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
+   */
+  #define BLK_LENGTH 42
+  while (length >= 1024) {
+    uint64_t t0, t1;
+    uint32_t crc0 = 0, crc1 = 0, crc2 = 0;

-  int64_t length = (int64_t)len;
-  crc ^= 0xffffffff;
-  buf8 = (const uint64_t *)data;
-  while ((length -= sizeof(uint64_t)) >= 0) {
-    crc = __crc32cd(crc, *buf8++);
+    /* Parallel Param:
+     * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+     * k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+     */
+    uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+    /* First 8 bytes for better pipelining */
+    crc0 = crc32c_u64(crc, *buf64++);
+
+    /* 3 blocks crc32c parallel computation
+     *
+     * 42 * 8 * 3 = 1008 (bytes)
+     */
+    for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
+      crc0 = crc32c_u64(crc0, *buf64);
+      crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
+      crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
+    }
+    buf64 += (BLK_LENGTH * 2);
+
+    /* Last 8 bytes */
+    crc = crc32c_u64(crc2, *buf64++);
+
+    t0 = (uint64_t)vmull_p64(crc0, k0);
+    t1 = (uint64_t)vmull_p64(crc1, k1);
+
+    /* Merge (crc0, crc1, crc2) -> crc */
+    crc1 = crc32c_u64(0, t1);
+    crc ^= crc1;
+    crc0 = crc32c_u64(0, t0);
+    crc ^= crc0;
+
+    length -= 1024;
+  }
+#endif
+  buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t*)buf8);
+    buf8 += 8;
+    length -= 8;
   }

   /* The following is more efficient than the straight loop */
-  buf4 = (const uint32_t *)buf8;
-  if (length & sizeof(uint32_t)) {
-    crc = __crc32cw(crc, *buf4++);
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t*)buf8);
+    buf8 += 4;
     length -= 4;
   }

-  buf2 = (const uint16_t *)buf4;
-  if (length & sizeof(uint16_t)) {
-    crc = __crc32ch(crc, *buf2++);
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t*)buf8);
+    buf8 += 2;
     length -= 2;
   }

-  buf1 = (const uint8_t *)buf2;
-  if (length & sizeof(uint8_t))
-    crc = __crc32cb(crc, *buf1);
+  if (length >= 1)
+    crc = crc32c_u8(crc, *buf8);

   crc ^= 0xffffffff;
   return crc;

diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
index 80b3aca361a..fb727ce4020 100644
---
a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -9,13 +9,24 @@ #include #if defined(__aarch64__) || defined(__AARCH64__) + #ifdef __ARM_FEATURE_CRC32 #define HAVE_ARM64_CRC #include +#define crc32c_u8(crc, v) __crc32cb(crc, v) +#define crc32c_u16(crc, v) __crc32ch(crc, v) +#define crc32c_u32(crc, v) __crc32cw(crc, v) +#define crc32c_u64(crc, v) __crc32cd(crc, v) + extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len); extern uint32_t crc32c_runtime_check(void); -#endif -#endif +#ifdef __ARM_FEATURE_CRYPTO +#define HAVE_ARM64_CRYPTO +#include +#endif // __ARM_FEATURE_CRYPTO +#endif // __ARM_FEATURE_CRC32 + +#endif // defined(__aarch64__) || defined(__AARCH64__) #endif From 22ce4624509694b8c35a15ef1fc49d3013f05a96 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Wed, 17 Jul 2019 12:22:21 -0700 Subject: [PATCH 230/572] Export Import sst files (#5495) Summary: Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135 This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469 "Add support for taking snapshot of a column family and creating column family from a given CF snapshot" We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality. (1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below. // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. // - SST files will be created as hard links when the directory specified // is in the same partition as the db directory, copied otherwise. // - export_dir should not already exist and will be created by this API. // - Always triggers a flush. virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, const std::string& export_dir, ExportImportFilesMetaData** metadata); Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by returning the list of file metadata. (2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below. // CreateColumnFamilyWithImport() will create a new column family with // column_family_name and import external SST files specified in metadata into // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in // an existing DB. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a // move, the call ensures that the specified files at external_file_path are // deleted on successful return and files are not modified on any error // return. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. 
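// Illustrative call sequence (a sketch, not code from this change; `db`,
// `options` and `metadata` are assumed to exist, with `metadata` produced by
// a prior ExportColumnFamily() call):
//   ColumnFamilyHandle* handle = nullptr;
//   ImportColumnFamilyOptions import_opts;  // copies files by default
//   Status s = db->CreateColumnFamilyWithImport(
//       options, "new_cf", import_opts, *metadata, &handle);
//   if (s.ok()) { /* use handle like any other column family */ }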
virtual Status CreateColumnFamilyWithImport(
      const ColumnFamilyOptions& options, const std::string& column_family_name,
      const ImportColumnFamilyOptions& import_options,
      const ExportImportFilesMetaData& metadata,
      ColumnFamilyHandle** handle);

Internally, this API creates a new CF, parses all the sst files and adds them
to the specified column family, at the same level and with the same sequence
numbers as in the metadata. Also performs safety checks with respect to
overlaps between the sst files being imported.

If the incoming sequence number is higher than the current local sequence
number, the local sequence number is updated to reflect this.

Note that as the sst files are being moved across column families, the column
family name recorded in each sst file will no longer match the actual column
family on the destination DB. The API does not modify the column family name
or id in the sst files being imported.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495

Differential Revision: D16018881

fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
---
 CMakeLists.txt                           |   1 +
 Makefile                                 |   5 +
 TARGETS                                  |   1 +
 db/compacted_db_impl.h                   |   9 +
 db/db_impl/db_impl.cc                    | 122 +++++
 db/db_impl/db_impl.h                     |  11 +-
 db/db_impl/db_impl_readonly.h            |  10 +
 db/db_test.cc                            |  10 +
 db/import_column_family_job.cc           | 257 +++++++++++
 db/import_column_family_job.h            |  70 +++
 db/import_column_family_test.cc          | 565 +++++++++++++++++++++++
 include/rocksdb/db.h                     |  21 +
 include/rocksdb/metadata.h               |   7 +
 include/rocksdb/options.h                |   6 +
 include/rocksdb/utilities/checkpoint.h   |  14 +
 include/rocksdb/utilities/stackable_db.h |  10 +
 src.mk                                   |   1 +
 utilities/checkpoint/checkpoint_impl.cc  | 185 ++++++++
 utilities/checkpoint/checkpoint_impl.h   |  23 +
 utilities/checkpoint/checkpoint_test.cc  | 126 +++++
 20 files changed, 1453 insertions(+), 1 deletion(-)
 create mode 100644 db/import_column_family_job.cc
 create mode 100644 db/import_column_family_job.h
 create mode 100644 db/import_column_family_test.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65904b8cae6..b49a13572bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -520,6 +520,7 @@ set(SOURCES
         db/flush_job.cc
         db/flush_scheduler.cc
         db/forward_iterator.cc
+        db/import_column_family_job.cc
         db/internal_stats.cc
         db/logs_with_prep_tracker.cc
         db/log_reader.cc
diff --git a/Makefile b/Makefile
index 100f160ca31..f8a904bd39d 100644
--- a/Makefile
+++ b/Makefile
@@ -500,6 +500,7 @@ TESTS = \
	plain_table_db_test \
	comparator_db_test \
	external_sst_file_test \
+	import_column_family_test \
	prefix_test \
	skiplist_test \
	write_buffer_manager_test \
@@ -577,6 +578,7 @@ PARALLEL_TEST = \
	db_universal_compaction_test \
	db_wal_test \
	external_sst_file_test \
+	import_column_family_test \
	fault_injection_test \
	inlineskiplist_test \
	manual_compaction_test \
@@ -1274,6 +1276,9 @@ external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util.
external_sst_file_test: db/external_sst_file_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

+import_column_family_test: db/import_column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
 db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

diff --git a/TARGETS b/TARGETS
index eda1051396d..cfd9ef73d40 100644
--- a/TARGETS
+++ b/TARGETS
@@ -113,6 +113,7 @@ cpp_library(
         "db/flush_job.cc",
         "db/flush_scheduler.cc",
         "db/forward_iterator.cc",
+        "db/import_column_family_job.cc",
         "db/internal_stats.cc",
         "db/log_reader.cc",
         "db/log_writer.cc",
diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h
index c1b8da9a782..e71ce249411 100644
--- a/db/compacted_db_impl.h
+++ b/db/compacted_db_impl.h
@@ -85,6 +85,15 @@ class CompactedDBImpl : public DBImpl {
       const IngestExternalFileOptions& /*ingestion_options*/) override {
     return Status::NotSupported("Not supported in compacted db mode.");
   }
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }

  private:
   friend class DB;
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 6f2ebdc8098..af9aea011a3 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -33,6 +33,7 @@
 #include "db/error_handler.h"
 #include "db/event_helpers.h"
 #include "db/external_sst_file_ingestion_job.h"
+#include "db/import_column_family_job.h"
 #include "db/flush_job.h"
 #include "db/forward_iterator.h"
 #include "db/job_context.h"
@@ -3894,6 +3895,127 @@ Status DBImpl::IngestExternalFiles(
   return status;
 }

+Status DBImpl::CreateColumnFamilyWithImport(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    const ImportColumnFamilyOptions& import_options,
+    const ExportImportFilesMetaData& metadata,
+    ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  assert(*handle == nullptr);
+  std::string cf_comparator_name = options.comparator->Name();
+  if (cf_comparator_name != metadata.db_comparator_name) {
+    return Status::InvalidArgument("Comparator name mismatch");
+  }
+
+  // Create column family.
+  auto status = CreateColumnFamily(options, column_family_name, handle);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Import sst files from metadata.
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
+  auto cfd = cfh->cfd();
+  ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
+                                   immutable_db_options_, env_options_,
+                                   import_options, metadata.files);
+
+  SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+  VersionEdit dummy_edit;
+  uint64_t next_file_number = 0;
+  std::list<uint64_t>::iterator pending_output_elem;
+  {
+    // Lock db mutex
+    InstrumentedMutexLock l(&mutex_);
+    if (error_handler_.IsDBStopped()) {
+      // Don't import files when there is a bg_error
+      status = error_handler_.GetBGError();
+    }
+
+    // Make sure that bg cleanup won't delete the files that we are importing
+    pending_output_elem = CaptureCurrentFileNumberInPendingOutputs();
+
+    if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse the file number that has already been assigned to
+      // the internal file, and this will overwrite the external file.
+      // To protect the external file, we have to make sure the file number
+      // will never be reused.
+      next_file_number =
+          versions_->FetchAddFileNumber(metadata.files.size());
+      auto cf_options = cfd->GetLatestMutableCFOptions();
+      status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                                      directories_.GetDbDir());
+      if (status.ok()) {
+        InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+      }
+    }
+  }
+  dummy_sv_ctx.Clean();
+
+  if (status.ok()) {
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
+    status = import_job.Prepare(next_file_number, sv);
+    CleanupSuperVersion(sv);
+  }
+
+  if (status.ok()) {
+    SuperVersionContext sv_context(true /*create_superversion*/);
+    {
+      // Lock db mutex
+      InstrumentedMutexLock l(&mutex_);
+
+      // Stop writes to the DB by entering both write threads
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      WriteThread::Writer nonmem_w;
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+
+      num_running_ingest_file_++;
+      assert(!cfd->IsDropped());
+      status = import_job.Run();
+
+      // Install job edit [Mutex will be unlocked here]
+      if (status.ok()) {
+        auto cf_options = cfd->GetLatestMutableCFOptions();
+        status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+                                        &mutex_, directories_.GetDbDir());
+        if (status.ok()) {
+          InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+        }
+      }
+
+      // Resume writes to the DB
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+      write_thread_.ExitUnbatched(&w);
+
+      num_running_ingest_file_--;
+      if (num_running_ingest_file_ == 0) {
+        bg_cv_.SignalAll();
+      }
+    }
+    // mutex_ is unlocked here
+
+    sv_context.Clean();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+  }
+
+  import_job.Cleanup(status);
+  if (!status.ok()) {
+    DropColumnFamily(*handle);
+    DestroyColumnFamilyHandle(*handle);
+    *handle = nullptr;
+  }
+  return status;
+}
+
 Status DBImpl::VerifyChecksum() {
   Status s;
   std::vector<ColumnFamilyData*> cfd_list;
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index d417035b1ef..547e3e1d6be 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -27,6 +27,7 @@
 #include "db/external_sst_file_ingestion_job.h"
 #include "db/flush_job.h"
 #include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
 #include "db/internal_stats.h"
 #include "db/log_writer.h"
 #include "db/logs_with_prep_tracker.h"
@@ -356,6 +357,13 @@ class DBImpl : public DB {
   virtual Status IngestExternalFiles(
       const std::vector<IngestExternalFileArg>& args) override;

+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) override;
+
   virtual Status VerifyChecksum() override;

   using DB::StartTrace;
@@ -1803,7 +1811,8 @@ class DBImpl : public DB {

   std::string db_absolute_path_;

-  // Number of running IngestExternalFile() calls.
+  // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+  // calls.
// REQUIRES: mutex held int num_running_ingest_file_; diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 18df900cba0..ad307677ccc 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -115,6 +115,16 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + private: friend class DB; diff --git a/db/db_test.cc b/db/db_test.cc index 69e91923cd6..36bdda59e21 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2492,6 +2492,16 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented"); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not implemented."); + } + Status VerifyChecksum() override { return Status::NotSupported("Not implemented."); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc new file mode 100644 index 00000000000..3c00a25917d --- /dev/null +++ b/db/import_column_family_job.cc @@ -0,0 +1,257 @@ +#ifndef ROCKSDB_LITE + +#include "db/import_column_family_job.h" + +#include +#include +#include +#include + +#include "db/version_edit.h" +#include "file/file_util.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "util/file_reader_writer.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, + SuperVersion* sv) { + Status status; + + // Read the information of files we are importing + for (const auto& file_metadata : metadata_) { + const auto file_path = file_metadata.db_path + "/" + file_metadata.name; + IngestedFileInfo file_to_import; + status = GetIngestedFileInfo(file_path, &file_to_import, sv); + if (!status.ok()) { + return status; + } + files_to_import_.push_back(file_to_import); + } + + const auto ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_import_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files don't have overlapping ranges in any particular + // level. + int min_level = 1; // Check for overlaps in Level 1 and above. 
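+    // (Level 0 is exempt from this check because L0 files may legitimately
+    // overlap one another: e.g. two L0 files covering [a, c] and [b, d] are
+    // valid, while the same pair within any level >= 1 must be rejected.)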
+    int max_level = -1;
+    for (const auto& file_metadata : metadata_) {
+      if (file_metadata.level > max_level) {
+        max_level = file_metadata.level;
+      }
+    }
+    for (int level = min_level; level <= max_level; ++level) {
+      autovector<const IngestedFileInfo*> sorted_files;
+      for (size_t i = 0; i < num_files; i++) {
+        if (metadata_[i].level == level) {
+          sorted_files.push_back(&files_to_import_[i]);
+        }
+      }
+
+      std::sort(sorted_files.begin(), sorted_files.end(),
+                [&ucmp](const IngestedFileInfo* info1,
+                        const IngestedFileInfo* info2) {
+                  return ucmp->Compare(info1->smallest_user_key,
+                                       info2->smallest_user_key) < 0;
+                });
+
+      for (size_t i = 0; i < sorted_files.size() - 1; i++) {
+        if (ucmp->Compare(sorted_files[i]->largest_user_key,
+                          sorted_files[i + 1]->smallest_user_key) >= 0) {
+          return Status::InvalidArgument("Files have overlapping ranges");
+        }
+      }
+    }
+  }
+
+  for (const auto& f : files_to_import_) {
+    if (f.num_entries == 0) {
+      return Status::InvalidArgument("File contains no entries");
+    }
+
+    if (!f.smallest_internal_key().Valid() ||
+        !f.largest_internal_key().Valid()) {
+      return Status::Corruption("File has corrupted keys");
+    }
+  }
+
+  // Copy/Move external files into DB
+  auto hardlink_files = import_options_.move_files;
+  for (auto& f : files_to_import_) {
+    f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
+
+    const auto path_outside_db = f.external_file_path;
+    const auto path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+    if (hardlink_files) {
+      status = env_->LinkFile(path_outside_db, path_inside_db);
+      if (status.IsNotSupported()) {
+        // Original file is on a different FS, use copy instead of hard linking
+        hardlink_files = false;
+      }
+    }
+    if (!hardlink_files) {
+      status = CopyFile(env_, path_outside_db, path_inside_db, 0,
+                        db_options_.use_fsync);
+    }
+    if (!status.ok()) {
+      break;
+    }
+    f.copy_file = !hardlink_files;
+    f.internal_file_path = path_inside_db;
+  }
+
+  if (!status.ok()) {
+    // We failed, remove all files that we copied into the db
+    for (const auto& f : files_to_import_) {
+      if (f.internal_file_path.empty()) {
+        break;
+      }
+      const auto s = env_->DeleteFile(f.internal_file_path);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+  Status status;
+  edit_.SetColumnFamily(cfd_->GetID());
+
+  for (size_t i = 0; i < files_to_import_.size(); ++i) {
+    const auto& f = files_to_import_[i];
+    const auto& file_metadata = metadata_[i];
+    edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+                  f.fd.GetFileSize(), f.smallest_internal_key(),
+                  f.largest_internal_key(), file_metadata.smallest_seqno,
+                  file_metadata.largest_seqno, false);
+
+    // If the incoming sequence number is higher, update the local sequence
+    // number.
+    if (file_metadata.largest_seqno > versions_->LastSequence()) {
+      versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+      versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+      versions_->SetLastSequence(file_metadata.largest_seqno);
+    }
+  }
+
+  return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add the files to the database, so remove all the files we
+    // copied.
+ for (const auto& f : files_to_import_) { + const auto s = env_->DeleteFile(f.internal_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } else if (status.ok() && import_options_.move_files) { + // The files were moved and added successfully, remove original file links + for (IngestedFileInfo& f : files_to_import_) { + const auto s = env_->DeleteFile(f.external_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } + } + } +} + +Status ImportColumnFamilyJob::GetIngestedFileInfo( + const std::string& external_file, IngestedFileInfo* file_to_import, + SuperVersion* sv) { + file_to_import->external_file_path = external_file; + + // Get external file size + auto status = env_->GetFileSize(external_file, &file_to_import->file_size); + if (!status.ok()) { + return status; + } + + // Create TableReader for external file + std::unique_ptr table_reader; + std::unique_ptr sst_file; + std::unique_ptr sst_file_reader; + + status = env_->NewRandomAccessFile(external_file, &sst_file, env_options_); + if (!status.ok()) { + return status; + } + sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file), + external_file)); + + status = cfd_->ioptions()->table_factory->NewTableReader( + TableReaderOptions(*cfd_->ioptions(), + sv->mutable_cf_options.prefix_extractor.get(), + env_options_, cfd_->internal_comparator()), + std::move(sst_file_reader), file_to_import->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + // Get the external file properties + auto props = table_reader->GetTableProperties(); + + // Set original_seqno to 0. + file_to_import->original_seqno = 0; + + // Get number of entries in table + file_to_import->num_entries = props->num_entries; + + ParsedInternalKey key; + ReadOptions ro; + // During reading the external file we can cache blocks that we read into + // the block cache, if we later change the global seqno of this file, we will + // have block in cache that will include keys with wrong seqno. + // We need to disable fill_cache so that we read from the file without + // updating the block cache. 
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+  // Get first (smallest) key from file
+  iter->SeekToFirst();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("external file has corrupted keys");
+  }
+  file_to_import->smallest_user_key = key.user_key.ToString();
+
+  // Get last (largest) key from file
+  iter->SeekToLast();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("external file has corrupted keys");
+  }
+  file_to_import->largest_user_key = key.user_key.ToString();
+
+  file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_import->table_properties = *props;
+
+  return status;
+}
+
+}  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h
new file mode 100644
index 00000000000..5b8577df1d5
--- /dev/null
+++ b/db/import_column_family_job.h
@@ -0,0 +1,70 @@
+#pragma once
+#include
+#include
+#include
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+  ImportColumnFamilyJob(
+      Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+      const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+      const ImportColumnFamilyOptions& import_options,
+      const std::vector<LiveFileMetaData>& metadata)
+      : env_(env),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        env_options_(env_options),
+        import_options_(import_options),
+        metadata_(metadata) {}
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+  // Will execute the import job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_import() const {
+    return files_to_import_;
+  }
+
+ private:
+  // Open the external file and populate `file_to_import` with all the
+  // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file, + IngestedFileInfo* file_to_import, + SuperVersion* sv); + + Env* env_; + VersionSet* versions_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const EnvOptions& env_options_; + autovector files_to_import_; + VersionEdit edit_; + const ImportColumnFamilyOptions& import_options_; + std::vector metadata_; +}; + +} // namespace rocksdb diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc new file mode 100644 index 00000000000..a93ecbf1173 --- /dev/null +++ b/db/import_column_family_test.cc @@ -0,0 +1,565 @@ +#ifndef ROCKSDB_LITE + +#include +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +class ImportColumnFamilyTest : public DBTestBase { + public: + ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + export_files_dir_ = test::TmpDir(env_) + "/export"; + import_cfh_ = nullptr; + import_cfh2_ = nullptr; + metadata_ptr_ = nullptr; + } + + ~ImportColumnFamilyTest() { + if (import_cfh_) { + db_->DropColumnFamily(import_cfh_); + db_->DestroyColumnFamilyHandle(import_cfh_); + import_cfh_ = nullptr; + } + if (import_cfh2_) { + db_->DropColumnFamily(import_cfh2_); + db_->DestroyColumnFamilyHandle(import_cfh2_); + import_cfh2_ = nullptr; + } + if (metadata_ptr_) { + delete metadata_ptr_; + metadata_ptr_ = nullptr; + } + test::DestroyDir(env_, sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + LiveFileMetaData LiveFileMetaDataInit(std::string name, + std::string path, + int level, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { + LiveFileMetaData metadata; + metadata.name = name; + metadata.db_path = path; + metadata.smallest_seqno = smallest_seqno; + metadata.largest_seqno = largest_seqno; + metadata.level = level; + return metadata; + } + + protected: + std::string sst_files_dir_; + std::string export_files_dir_; + ColumnFamilyHandle* import_cfh_; + ColumnFamilyHandle* import_cfh2_; + ExportImportFilesMetaData *metadata_ptr_; +}; + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // cf1.sst + const std::string cf1_sst_name = "cf1.sst"; + const std::string cf1_sst = sst_files_dir_ + cf1_sst_name; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst_name = "cf_unknown.sst"; + const std::string unknown_sst = sst_files_dir_ + unknown_sst_name; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K3", "V1")); + ASSERT_OK(sfw_unknown.Put("K4", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + { + // Import sst file corresponding to cf1 onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + 
ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + + { + // Import sst file corresponding to unknown cf onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_EQ(value, "V2"); + } +} + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + + // file3.sst + const std::string file3_sst_name = "file3.sst"; + const std::string file3_sst = sst_files_dir_ + file3_sst_name; + ASSERT_OK(sfw_cf1.Open(file3_sst)); + for (int i = 0; i < 100; ++i) { + sfw_cf1.Put(Key(i), Key(i) + "_val"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file2.sst + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + for (int i = 0; i < 100; i += 2) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1a.sst + const std::string file1a_sst_name = "file1a.sst"; + const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; + ASSERT_OK(sfw_cf1.Open(file1a_sst)); + for (int i = 0; i < 52; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1b.sst + const std::string file1b_sst_name = "file1b.sst"; + const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; + ASSERT_OK(sfw_cf1.Open(file1b_sst)); + for (int i = 52; i < 100; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0a.sst + const std::string file0a_sst_name = "file0a.sst"; + const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; + ASSERT_OK(sfw_cf1.Open(file0a_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0b.sst + const std::string file0b_sst_name = "file0b.sst"; + const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; + ASSERT_OK(sfw_cf1.Open(file0b_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst files and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29)); + metadata.files.push_back( + LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34)); + metadata.files.push_back( + LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39)); + metadata.files.push_back( + 
LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49)); + metadata.files.push_back( + LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + for (int i = 0; i < 100; i += 5) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5")); + } + + // Flush and check again + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + // Compact and check again. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + import_options.move_files = true; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options, + *metadata_ptr_, &import_cfh2_)); + ASSERT_NE(import_cfh2_, nullptr); + delete metadata_ptr_; + metadata_ptr_ = NULL; + + std::string value1, value2; + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Get(1, Key(i)), value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Modify keys in cf1 and verify. + for (int i = 0; i < 25; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i))); + } + for (int i = 25; i < 50; i++) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3")); + } + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Compact and check again. + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + // Compact to create a L1 file. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 50; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + + for (int i = 0; i < 25; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. + ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + + // Create a new db and import the files. 
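+  // (The exported metadata is self-contained: each LiveFileMetaData's db_path
+  // points into export_files_dir_, so a different DB instance can import the
+  // files, as long as its comparator matches db_comparator_name.)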
+ DB* db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + for (int i = 0; i < 100; ++i) { + std::string value; + db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_EQ(Get(1, Key(i)), value); + } + db_copy->DropColumnFamily(cfh); + test::DestroyDir(env_, dbname_ + "/db_copy"); +} + +TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + { + // Create column family with existing cf name. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Column family already exists")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with no files specified. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("The list of files is empty")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with overlapping keys in sst files. + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K3", "V3")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with a mismatching comparator, should fail with appropriate error. 
+ ExportImportFilesMetaData metadata; + Options mismatch_options = CurrentOptions(); + mismatch_options.comparator = ReverseBytewiseComparator(); + SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = mismatch_options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Comparator name mismatch")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with non existent sst file should fail with appropriate error + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file3_sst_name = "file3.sst"; + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::IOError("No such file or directory")); + ASSERT_EQ(import_cfh_, nullptr); + + // Test successful import after a failure with the same CF name. Ensures + // there is no side effect with CF when there is a failed import + metadata.files.pop_back(); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + } + +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int argc, char** argv) { + fprintf(stderr, + "SKIPPED as External SST File Writer and Import are not supported " + "in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0f8573e4319..d90ca900f45 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1174,6 +1174,27 @@ class DB { virtual Status IngestExternalFiles( const std::vector& args) = 0; + // CreateColumnFamilyWithImport() will create a new column family with + // column_family_name and import external SST files specified in metadata into + // this column family. + // (1) External SST files can be created using SstFileWriter. + // (2) External SST files can be exported from a particular column family in + // an existing DB. + // Option in import_options specifies whether the external files are copied or + // moved (default is copy). When option specifies copy, managing files at + // external_file_path is caller's responsibility. 
When option specifies a
  // move, the call ensures that the specified files at external_file_path are
  // deleted on successful return and files are not modified on any error
  // return.
  // On error return, column family handle returned will be nullptr.
  // ColumnFamily will be present on successful return and will not be present
  // on error return. ColumnFamily may be present on any crash during this call.
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) = 0;
+
   virtual Status VerifyChecksum() = 0;

   // AddFile() is deprecated, please use IngestExternalFile()
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index a0ab41efdfb..7b251eb7203 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -108,4 +108,11 @@ struct LiveFileMetaData : SstFileMetaData {
   int level;  // Level at which this file resides.
   LiveFileMetaData() : column_family_name(), level(0) {}
 };
+
+// Metadata returned as output from ExportColumnFamily() and used as input to
+// CreateColumnFamilyWithImport().
+struct ExportImportFilesMetaData {
+  std::string db_comparator_name;       // Used as a safety check at import.
+  std::vector<LiveFileMetaData> files;  // Vector of file metadata.
+};
 }  // namespace rocksdb
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 8ebcd292dba..09dc8e54c5c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1491,4 +1491,10 @@ struct TraceOptions {
   uint64_t filter = kTraceFilterNone;
 };
+
+// ImportColumnFamilyOptions is used by CreateColumnFamilyWithImport()
+struct ImportColumnFamilyOptions {
+  // Can be set to true to move the files instead of copying them.
+  bool move_files = false;
+};
+
 }  // namespace rocksdb
diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h
index aa0a394d4d0..5f12922c454 100644
--- a/include/rocksdb/utilities/checkpoint.h
+++ b/include/rocksdb/utilities/checkpoint.h
@@ -9,11 +9,15 @@
 #ifndef ROCKSDB_LITE

 #include
+#include
 #include "rocksdb/status.h"

 namespace rocksdb {

 class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;

 class Checkpoint {
  public:
@@ -36,6 +40,16 @@ class Checkpoint {
   virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
                                   uint64_t log_size_for_flush = 0);

+  // Exports all live SST files of a specified Column Family onto export_dir,
+  // returning SST files information in metadata.
+  // - SST files will be created as hard links when the directory specified
+  //   is in the same partition as the db directory, copied otherwise.
+  // - export_dir should not already exist and will be created by this API.
+  // - Always triggers a flush.
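+  //
+  // Illustrative usage (a sketch; `db` and `cf_handle` are assumed to exist,
+  // and on success the caller owns *metadata and must delete it):
+  //   Checkpoint* checkpoint = nullptr;
+  //   Status s = Checkpoint::Create(db, &checkpoint);
+  //   ExportImportFilesMetaData* metadata = nullptr;
+  //   if (s.ok()) {
+  //     s = checkpoint->ExportColumnFamily(cf_handle, "/tmp/cf_export",
+  //                                        &metadata);
+  //   }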
+ virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata); + virtual ~Checkpoint() {} }; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 8535952cd3e..a52aff5d8b1 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -120,6 +120,16 @@ class StackableDB : public DB { return db_->IngestExternalFiles(args); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } using DB::KeyMayExist; diff --git a/src.mk b/src.mk index fe930d5f49b..4d635173b89 100644 --- a/src.mk +++ b/src.mk @@ -36,6 +36,7 @@ LIB_SOURCES = \ db/flush_job.cc \ db/flush_scheduler.cc \ db/forward_iterator.cc \ + db/import_column_family_job.cc \ db/internal_stats.cc \ db/logs_with_prep_tracker.cc \ db/log_reader.cc \ diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 4835f26da6e..0639ed2f2b4 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -22,6 +22,7 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/metadata.h" #include "rocksdb/transaction_log.h" #include "rocksdb/utilities/checkpoint.h" #include "test_util/sync_point.h" @@ -60,6 +61,12 @@ void CheckpointImpl::CleanStagingDirectory( full_private_path.c_str(), s.ToString().c_str()); } +Status Checkpoint::ExportColumnFamily( + ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/, + ExportImportFilesMetaData** /*metadata*/) { + return Status::NotSupported(""); +} + // Builds an openable snapshot of RocksDB Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, uint64_t log_size_for_flush) { @@ -322,6 +329,184 @@ Status CheckpointImpl::CreateCustomCheckpoint( return s; } +// Exports all live SST files of a specified Column Family onto export_dir, +// returning SST files information in metadata. +Status CheckpointImpl::ExportColumnFamily( + ColumnFamilyHandle* handle, const std::string& export_dir, + ExportImportFilesMetaData** metadata) { + auto cfh = reinterpret_cast(handle); + const auto cf_name = cfh->GetName(); + const auto db_options = db_->GetDBOptions(); + + assert(metadata != nullptr); + assert(*metadata == nullptr); + auto s = db_->GetEnv()->FileExists(export_dir); + if (s.ok()) { + return Status::InvalidArgument("Specified export_dir exists"); + } else if (!s.IsNotFound()) { + assert(s.IsIOError()); + return s; + } + + const auto final_nonslash_idx = export_dir.find_last_not_of('/'); + if (final_nonslash_idx == std::string::npos) { + return Status::InvalidArgument("Specified export_dir invalid"); + } + ROCKS_LOG_INFO(db_options.info_log, + "[%s] export column family onto export directory %s", + cf_name.c_str(), export_dir.c_str()); + + // Create a temporary export directory. 
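+  // (Writing into "<export_dir>.tmp" and renaming it only after all files
+  // have been linked/copied makes the final directory appear atomically, so
+  // a reader never observes a partially populated export_dir.)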
+ const auto tmp_export_dir = + export_dir.substr(0, final_nonslash_idx + 1) + ".tmp"; + s = db_->GetEnv()->CreateDir(tmp_export_dir); + + if (s.ok()) { + s = db_->Flush(rocksdb::FlushOptions(), handle); + } + + ColumnFamilyMetaData db_metadata; + if (s.ok()) { + // Export live sst files with file deletions disabled. + s = db_->DisableFileDeletions(); + if (s.ok()) { + db_->GetColumnFamilyMetaData(handle, &db_metadata); + + s = ExportFilesInMetaData( + db_options, db_metadata, + [&](const std::string& src_dirname, const std::string& fname) { + ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s", + cf_name.c_str(), fname.c_str()); + return db_->GetEnv()->LinkFile(src_dirname + fname, + tmp_export_dir + fname); + } /*link_file_cb*/, + [&](const std::string& src_dirname, const std::string& fname) { + ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s", + cf_name.c_str(), fname.c_str()); + return CopyFile(db_->GetEnv(), src_dirname + fname, + tmp_export_dir + fname, 0, db_options.use_fsync); + } /*copy_file_cb*/); + + const auto enable_status = db_->EnableFileDeletions(false /*force*/); + if (s.ok()) { + s = enable_status; + } + } + } + + auto moved_to_user_specified_dir = false; + if (s.ok()) { + // Move temporary export directory to the actual export directory. + s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir); + } + + if (s.ok()) { + // Fsync export directory. + moved_to_user_specified_dir = true; + std::unique_ptr dir_ptr; + s = db_->GetEnv()->NewDirectory(export_dir, &dir_ptr); + if (s.ok()) { + assert(dir_ptr != nullptr); + s = dir_ptr->Fsync(); + } + } + + if (s.ok()) { + // Export of files succeeded. Fill in the metadata information. + auto result_metadata = new ExportImportFilesMetaData(); + result_metadata->db_comparator_name = handle->GetComparator()->Name(); + for (const auto& level_metadata : db_metadata.levels) { + for (const auto& file_metadata : level_metadata.files) { + LiveFileMetaData live_file_metadata; + live_file_metadata.size = file_metadata.size; + live_file_metadata.name = std::move(file_metadata.name); + live_file_metadata.db_path = export_dir; + live_file_metadata.smallest_seqno = file_metadata.smallest_seqno; + live_file_metadata.largest_seqno = file_metadata.largest_seqno; + live_file_metadata.smallestkey = std::move(file_metadata.smallestkey); + live_file_metadata.largestkey = std::move(file_metadata.largestkey); + live_file_metadata.level = level_metadata.level; + result_metadata->files.push_back(live_file_metadata); + } + *metadata = result_metadata; + } + ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.", + cf_name.c_str()); + } else { + // Failure: Clean up all the files/directories created. + ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s", + cf_name.c_str(), s.ToString().c_str()); + std::vector subchildren; + const auto cleanup_dir = + moved_to_user_specified_dir ? 
export_dir : tmp_export_dir; + db_->GetEnv()->GetChildren(cleanup_dir, &subchildren); + for (const auto& subchild : subchildren) { + const auto subchild_path = cleanup_dir + "/" + subchild; + const auto status = db_->GetEnv()->DeleteFile(subchild_path); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s", + subchild_path.c_str(), status.ToString().c_str()); + } + } + const auto status = db_->GetEnv()->DeleteDir(cleanup_dir); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s", + cleanup_dir.c_str(), status.ToString().c_str()); + } + } + return s; +} + +Status CheckpointImpl::ExportFilesInMetaData( + const DBOptions& db_options, const ColumnFamilyMetaData& metadata, + std::function + link_file_cb, + std::function + copy_file_cb) { + Status s; + auto hardlink_file = true; + + // Copy/hard link files in metadata. + size_t num_files = 0; + for (const auto& level_metadata : metadata.levels) { + for (const auto& file_metadata : level_metadata.files) { + uint64_t number; + FileType type; + const auto ok = ParseFileName(file_metadata.name, &number, &type); + if (!ok) { + s = Status::Corruption("Could not parse file name"); + break; + } + + // We should only get sst files here. + assert(type == kTableFile); + assert(file_metadata.size > 0 && file_metadata.name[0] == '/'); + const auto src_fname = file_metadata.name; + ++num_files; + + if (hardlink_file) { + s = link_file_cb(db_->GetName(), src_fname); + if (num_files == 1 && s.IsNotSupported()) { + // Fallback to copy if link failed due to cross-device directories. + hardlink_file = false; + s = Status::OK(); + } + } + if (!hardlink_file) { + s = copy_file_cb(db_->GetName(), src_fname); + } + if (!s.ok()) { + break; + } + } + } + ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt, + num_files); + + return s; +} } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h index d26a9f66bfc..0d87b635b8d 100644 --- a/utilities/checkpoint/checkpoint_impl.h +++ b/utilities/checkpoint/checkpoint_impl.h @@ -30,6 +30,17 @@ class CheckpointImpl : public Checkpoint { virtual Status CreateCheckpoint(const std::string& checkpoint_dir, uint64_t log_size_for_flush) override; + // Exports all live SST files of a specified Column Family onto export_dir + // and returning SST files information in metadata. + // - SST files will be created as hard links when the directory specified + // is in the same partition as the db directory, copied otherwise. + // - export_dir should not already exist and will be created by this API. + // - Always triggers a flush. + using Checkpoint::ExportColumnFamily; + virtual Status ExportColumnFamily( + ColumnFamilyHandle* handle, const std::string& export_dir, + ExportImportFilesMetaData** metadata) override; + // Checkpoint logic can be customized by providing callbacks for link, copy, // or create. Status CreateCustomCheckpoint( @@ -48,6 +59,18 @@ class CheckpointImpl : public Checkpoint { private: void CleanStagingDirectory(const std::string& path, Logger* info_log); + + // Export logic customization by providing callbacks for link or copy. 
+ Status ExportFilesInMetaData( + const DBOptions& db_options, const ColumnFamilyMetaData& metadata, + std::function + link_file_cb, + std::function + copy_file_cb); + + private: DB* db_; }; diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index d7d2548af3e..d748f500ebc 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -26,6 +26,7 @@ #include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { class CheckpointTest : public testing::Test { @@ -44,6 +45,9 @@ class CheckpointTest : public testing::Test { Options last_options_; std::vector handles_; std::string snapshot_name_; + std::string export_path_; + ColumnFamilyHandle* cfh_reverse_comp_; + ExportImportFilesMetaData* metadata_; CheckpointTest() : env_(Env::Default()) { env_->SetBackgroundThreads(1, Env::LOW); @@ -64,12 +68,24 @@ class CheckpointTest : public testing::Test { EXPECT_OK(DestroyDB(snapshot_tmp_name, options)); env_->DeleteDir(snapshot_tmp_name); Reopen(options); + export_path_ = test::TmpDir(env_) + "/export"; + test::DestroyDir(env_, export_path_); + cfh_reverse_comp_ = nullptr; + metadata_ = nullptr; } ~CheckpointTest() override { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->LoadDependency({}); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + if (cfh_reverse_comp_) { + EXPECT_OK(db_->DestroyColumnFamilyHandle(cfh_reverse_comp_)); + cfh_reverse_comp_ = nullptr; + } + if (metadata_) { + delete metadata_; + metadata_ = nullptr; + } Close(); Options options; options.db_paths.emplace_back(dbname_, 0); @@ -78,6 +94,7 @@ class CheckpointTest : public testing::Test { options.db_paths.emplace_back(dbname_ + "_4", 0); EXPECT_OK(DestroyDB(dbname_, options)); EXPECT_OK(DestroyDB(snapshot_name_, options)); + test::DestroyDir(env_, export_path_); } // Return the current option configuration. @@ -140,6 +157,12 @@ class CheckpointTest : public testing::Test { ASSERT_OK(TryReopen(options)); } + void CompactAll() { + for (auto h : handles_) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), h, nullptr, nullptr)); + } + } + void Close() { for (auto h : handles_) { delete h; @@ -289,6 +312,109 @@ TEST_F(CheckpointTest, GetSnapshotLink) { } } +TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) { + // Create a database + Status s; + auto options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({}, options); + + // Helper to verify the number of files in metadata and export dir + auto verify_files_exported = [&](const ExportImportFilesMetaData& metadata, + int num_files_expected) { + ASSERT_EQ(metadata.files.size(), num_files_expected); + std::vector subchildren; + env_->GetChildren(export_path_, &subchildren); + int num_children = 0; + for (const auto& child : subchildren) { + if (child != "." 
&& child != "..") { + ++num_children; + } + } + ASSERT_EQ(num_children, num_files_expected); + }; + + // Test DefaultColumnFamily + { + const auto key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export the Tables and verify + ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_)); + verify_files_exported(*metadata_, 1); + ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name()); + test::DestroyDir(env_, export_path_); + delete metadata_; + metadata_ = nullptr; + + // Check again after compaction + CompactAll(); + ASSERT_OK(Put(key, "v2")); + ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_)); + verify_files_exported(*metadata_, 2); + ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name()); + test::DestroyDir(env_, export_path_); + delete metadata_; + metadata_ = nullptr; + delete checkpoint; + } + + // Test non default column family with non default comparator + { + auto cf_options = CurrentOptions(); + cf_options.comparator = ReverseBytewiseComparator(); + ASSERT_OK( + db_->CreateColumnFamily(cf_options, "yoyo", &cfh_reverse_comp_)); + + const auto key = std::string("foo"); + ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export the Tables and verify + ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_, + &metadata_)); + verify_files_exported(*metadata_, 1); + ASSERT_EQ(metadata_->db_comparator_name, + ReverseBytewiseComparator()->Name()); + delete checkpoint; + } +} + +TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) { + // Create a database + Status s; + auto options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({}, options); + + const auto key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + + // Export onto existing directory + env_->CreateDirIfMissing(export_path_); + ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_), + Status::InvalidArgument("Specified export_dir exists")); + test::DestroyDir(env_, export_path_); + + // Export with invalid directory specification + export_path_ = ""; + ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_), + Status::InvalidArgument("Specified export_dir invalid")); + delete checkpoint; +} + TEST_F(CheckpointTest, CheckpointCF) { Options options = CurrentOptions(); CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options); From 8a008d41701823af69c2185c7460280c5d8fac74 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Wed, 17 Jul 2019 13:02:00 -0700 Subject: [PATCH 231/572] Block access tracing: Trace referenced key for Get on non-data blocks. (#5548) Summary: This PR traces the referenced key for Get for all types of blocks. This is useful when evaluating hybrid row-block caches. 
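A minimal sketch of a trace consumer reading the new field (illustration only, not part of this change; the include paths and trace file path are assumptions, while the reader API matches the one exercised in the table_test.cc changes below):

    // Illustration only: iterate a block cache trace and print the referenced
    // key, which this change now records for index/filter block accesses too.
    #include <cinttypes>
    #include <cstdio>
    #include <memory>
    #include <string>
    #include <utility>

    #include "rocksdb/trace_reader_writer.h"      // assumed header for NewFileTraceReader
    #include "trace_replay/block_cache_tracer.h"

    void DumpGetAccesses(rocksdb::Env* env, const std::string& trace_file) {
      std::unique_ptr<rocksdb::TraceReader> trace_reader;
      rocksdb::Status s = rocksdb::NewFileTraceReader(
          env, rocksdb::EnvOptions(), trace_file, &trace_reader);
      if (!s.ok()) {
        return;
      }
      rocksdb::BlockCacheTraceReader reader(std::move(trace_reader));
      rocksdb::BlockCacheTraceHeader header;
      if (!reader.ReadHeader(&header).ok()) {
        return;
      }
      rocksdb::BlockCacheTraceRecord access;
      while (reader.ReadAccess(&access).ok()) {
        if (access.caller == rocksdb::TableReaderCaller::kUserGet) {
          // referenced_key carries the full internal key (user key + seqno).
          fprintf(stdout, "get_id=%" PRIu64 " key=%s\n", access.get_id,
                  access.referenced_key.c_str());
        }
      }
    }

Because the referenced key is now recorded for index and filter accesses as well, such a consumer can attribute every block access within a Get to the row that triggered it, which is what a hybrid row-block cache evaluation needs.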
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5548 Test Plan: make clean && USE_CLANG=1 make check -j32 Differential Revision: D16157979 Pulled By: HaoyuHuang fbshipit-source-id: f6327411c9deb74e35e22a35f66cdbae09ab9d87 --- table/block_based/block_based_table_reader.cc | 58 ++-- table/table_test.cc | 276 +++++++++++++++++- tools/block_cache_trace_analyzer.h | 4 +- trace_replay/block_cache_tracer.cc | 34 ++- trace_replay/block_cache_tracer.h | 40 ++- trace_replay/block_cache_tracer_test.cc | 9 +- utilities/simulator_cache/cache_simulator.cc | 19 +- utilities/simulator_cache/cache_simulator.h | 1 - .../simulator_cache/cache_simulator_test.cc | 22 +- 9 files changed, 386 insertions(+), 77 deletions(-) diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a888603d72b..fde11c0d362 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1983,10 +1983,12 @@ CachableEntry BlockBasedTable::GetUncompressionDict( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id); + /*no_insert=*/no_io, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); block_cache_tracer_->WriteBlockAccess(access_record, cache_key, rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); + lookup_context->referenced_key); } return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr, cache_handle, false /* own_value */}; @@ -2237,7 +2239,6 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( Slice key /* key to the block cache */; Slice ckey /* key to the compressed block cache */; bool is_cache_hit = false; - bool no_insert = true; if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache if (block_cache != nullptr) { @@ -2265,7 +2266,6 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // Can't find the block from the cache. If I/O is allowed, read from the // file. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - no_insert = false; Statistics* statistics = rep_->ioptions.statistics; const bool maybe_compressed = block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; @@ -2332,11 +2332,11 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( assert(false); break; } - if (BlockCacheTraceHelper::ShouldTraceReferencedKey( + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( trace_block_type, lookup_context->caller)) { // Defer logging the access to Get() and MultiGet() to trace additional - // information, e.g., the referenced key, - // referenced_key_exist_in_block. + // information, e.g., referenced_key_exist_in_block. // Make a copy of the block key here since it will be logged later. 
lookup_context->FillLookupContext( @@ -2351,10 +2351,12 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - no_insert, lookup_context->get_id); + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); block_cache_tracer_->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); + lookup_context->referenced_key); } } @@ -3288,12 +3290,18 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // First check the full filter // If full filter not useful, Then go into each block uint64_t tracing_get_id = get_context->get_tracing_get_id(); - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, - tracing_get_id}; + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } const bool may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, get_context, &lookup_context); - if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); @@ -3347,7 +3355,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } BlockCacheLookupContext lookup_data_block_context{ - TableReaderCaller::kUserGet, tracing_get_id}; + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr}; bool does_referenced_key_exist = false; DataBlockIter biter; uint64_t referenced_data_size = 0; @@ -3406,7 +3416,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (does_referenced_key_exist) { referenced_key = biter.key(); } else { - referenced_key = ExtractUserKey(key); + referenced_key = key; } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), @@ -3417,6 +3427,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); @@ -3460,8 +3471,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } - BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, - tracing_mget_id}; + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, prefix_extractor, &lookup_context); @@ -3492,11 +3504,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); 
- BlockCacheLookupContext lookup_compression_dict_context( - TableReaderCaller::kUserMultiGet); - auto uncompression_dict_storage = GetUncompressionDict(nullptr, no_io, - sst_file_range.begin()->get_context, - &lookup_compression_dict_context); + auto uncompression_dict_storage = GetUncompressionDict( + nullptr, no_io, sst_file_range.begin()->get_context, &lookup_context); const UncompressionDict& uncompression_dict = uncompression_dict_storage.GetValue() == nullptr ? UncompressionDict::GetEmptyDict() @@ -3591,7 +3600,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet, tracing_mget_id); + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr); if (first_block) { if (!block_handles[idx_in_batch].IsNull() || !results[idx_in_batch].IsEmpty()) { @@ -3685,7 +3696,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (does_referenced_key_exist) { referenced_key = biter->key(); } else { - referenced_key = ExtractUserKey(key); + referenced_key = key; } BlockCacheTraceRecord access_record( rep_->ioptions.env->NowMicros(), @@ -3696,6 +3707,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, lookup_data_block_context.is_cache_hit, lookup_data_block_context.no_insert, lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); diff --git a/table/table_test.cc b/table/table_test.cc index c54933b781a..bb034311668 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -63,6 +63,8 @@ extern const uint64_t kPlainTableMagicNumber; namespace { +const std::string kDummyValue(10000, 'o'); + // DummyPropertiesCollector used to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: @@ -312,7 +314,9 @@ class TableConstructor: public Constructor { : Constructor(cmp), largest_seqno_(largest_seqno), convert_to_internal_key_(convert_to_internal_key), - level_(level) {} + level_(level) { + env_ = rocksdb::Env::Default(); + } ~TableConstructor() override { Reset(); } Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, @@ -371,7 +375,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_, largest_seqno_, nullptr), + level_, largest_seqno_, &block_cache_tracer_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -425,6 +429,8 @@ class TableConstructor: public Constructor { return static_cast(file_writer_->writable_file()); } + BlockCacheTracer block_cache_tracer_; + private: void Reset() { uniq_id_ = 0; @@ -445,6 +451,7 @@ class TableConstructor: public Constructor { static uint64_t cur_uniq_id_; EnvOptions soptions; + Env* env_; }; uint64_t TableConstructor::cur_uniq_id_ = 1; @@ -1063,7 +1070,9 @@ class BlockBasedTableTest : public TableTest, virtual public ::testing::WithParamInterface { public: - BlockBasedTableTest() : format_(GetParam()) {} + BlockBasedTableTest() : format_(GetParam()) { + env_ = rocksdb::Env::Default(); + } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions 
options; @@ -1071,11 +1080,91 @@ class BlockBasedTableTest return options; } + void SetupTracingTest(TableConstructor* c) { + test_path_ = test::PerThreadDBPath("block_based_table_tracing_test"); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace_file"; + TraceOptions trace_opt; + std::unique_ptr trace_writer; + EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, + &trace_writer)); + c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer)); + { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + { + std::string user_key = "k02"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + } + + void VerifyBlockAccessTrace( + TableConstructor* c, + const std::vector& expected_records) { + c->block_cache_tracer_.EndTrace(); + + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; + } + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + } + index++; + } + EXPECT_EQ(index, expected_records.size()); + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + protected: uint64_t IndexUncompressedHelper(bool indexCompress); private: uint32_t format_; + Env* env_; + std::string trace_file_path_; + std::string test_path_; }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; @@ -2211,6 +2300,187 @@ TEST_P(BlockBasedTableTest, NumBlockStat) { c.ResetTableReader(); } +TEST_P(BlockBasedTableTest, TracingGetTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + 
table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + for (uint32_t i = 1; i <= 2; i++) { + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, /*get_id=*/i); + get_perf_context()->Reset(); + ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value.ToString(), kDummyValue); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for one index, one filter, and one data + // block access. + record.get_id = 1; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + record.referenced_key_exist_in_block = Boolean::kTrue; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.is_cache_hit = Boolean::kFalse; + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + // The second get should all observe cache hits. 
+ record.is_cache_hit = Boolean::kTrue; + record.get_id = 2; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + for (uint32_t i = 1; i <= 2; i++) { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c.GetTableReader()->ApproximateOffsetOf( + encoded_key, TableReaderCaller::kUserApproximateSize); + } + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have two records for only index blocks. 
+ record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserApproximateSize; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingIterator) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + for (uint32_t i = 1; i <= 2; i++) { + std::unique_ptr iter(c.GetTableReader()->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUserIterator)); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for index and two data block access. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserIterator; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + record.is_cache_hit = Boolean::kFalse; + expected_records.push_back(record); + expected_records.push_back(record); + // When we iterate this file for the second time, we should observe all cache + // hits. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + // A simple tool that takes the snapshot of block cache statistics. 
class BlockCachePropertiesSnapshot { public: diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index feb7c21f22c..32a90342cb1 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -57,8 +57,8 @@ struct BlockAccessInfo { const uint64_t timestamp_in_seconds = access.access_timestamp / kMicrosInSecond; caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1; - if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type, - access.caller)) { + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type, + access.caller)) { num_keys = access.num_keys_in_block; if (access.referenced_key_exist_in_block == Boolean::kTrue) { if (key_num_access_map.find(access.referenced_key) == diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index a74dc4d58cb..4f320ef2d0f 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -35,14 +35,13 @@ const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName = "UnknownColumnFamily"; const uint64_t BlockCacheTraceHelper::kReservedGetId = 0; -bool BlockCacheTraceHelper::ShouldTraceReferencedKey(TraceType block_type, - TableReaderCaller caller) { +bool BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + TraceType block_type, TableReaderCaller caller) { return (block_type == TraceType::kBlockTraceDataBlock) && - (caller == TableReaderCaller::kUserGet || - caller == TableReaderCaller::kUserMultiGet); + IsGetOrMultiGet(caller); } -bool BlockCacheTraceHelper::ShouldTraceGetId(TableReaderCaller caller) { +bool BlockCacheTraceHelper::IsGetOrMultiGet(TableReaderCaller caller) { return caller == TableReaderCaller::kUserGet || caller == TableReaderCaller::kUserMultiGet; } @@ -81,12 +80,13 @@ Status BlockCacheTraceWriter::WriteBlockAccess( trace.payload.push_back(record.caller); trace.payload.push_back(record.is_cache_hit); trace.payload.push_back(record.no_insert); - if (BlockCacheTraceHelper::ShouldTraceGetId(record.caller)) { + if (BlockCacheTraceHelper::IsGetOrMultiGet(record.caller)) { PutFixed64(&trace.payload, record.get_id); - } - if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type, - record.caller)) { + trace.payload.push_back(record.get_from_user_specified_snapshot); PutLengthPrefixedSlice(&trace.payload, referenced_key); + } + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record.block_type, + record.caller)) { PutFixed64(&trace.payload, record.referenced_data_size); PutFixed64(&trace.payload, record.num_keys_in_block); trace.payload.push_back(record.referenced_key_exist_in_block); @@ -216,20 +216,28 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) { } record->no_insert = static_cast(enc_slice[0]); enc_slice.remove_prefix(kCharSize); - if (BlockCacheTraceHelper::ShouldTraceGetId(record->caller)) { + if (BlockCacheTraceHelper::IsGetOrMultiGet(record->caller)) { if (!GetFixed64(&enc_slice, &record->get_id)) { return Status::Incomplete( "Incomplete access record: Failed to read the get id."); } - } - if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type, - record->caller)) { + if (enc_slice.empty()) { + return Status::Incomplete( + "Incomplete access record: Failed to read " + "get_from_user_specified_snapshot."); + } + record->get_from_user_specified_snapshot = + static_cast(enc_slice[0]); + enc_slice.remove_prefix(kCharSize); Slice referenced_key; if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) { return 
Status::Incomplete( "Incomplete access record: Failed to read the referenced key."); } record->referenced_key = referenced_key.ToString(); + } + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record->block_type, + record->caller)) { if (!GetFixed64(&enc_slice, &record->referenced_data_size)) { return Status::Incomplete( "Incomplete access record: Failed to read the referenced data size."); diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 3b26a18d639..b1a258843e5 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -23,9 +23,9 @@ extern const uint64_t kSecondInHour; class BlockCacheTraceHelper { public: - static bool ShouldTraceReferencedKey(TraceType block_type, - TableReaderCaller caller); - static bool ShouldTraceGetId(TableReaderCaller caller); + static bool IsGetOrMultiGetOnDataBlock(TraceType block_type, + TableReaderCaller caller); + static bool IsGetOrMultiGet(TableReaderCaller caller); static bool IsUserAccess(TableReaderCaller caller); static const std::string kUnknownColumnFamilyName; @@ -53,8 +53,11 @@ class BlockCacheTraceHelper { // kUserApproximateSize). struct BlockCacheLookupContext { BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} - BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id) - : caller(_caller), get_id(_get_id) {} + BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id, + bool _get_from_user_specified_snapshot) + : caller(_caller), + get_id(_get_id), + get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {} const TableReaderCaller caller; // These are populated when we perform lookup/insert on block cache. The block // cache tracer uses these inforation when logging the block access at @@ -69,6 +72,8 @@ struct BlockCacheLookupContext { // how many blocks a Get/MultiGet request accesses. We can also measure the // impact of row cache vs block cache. uint64_t get_id = 0; + std::string referenced_key; + bool get_from_user_specified_snapshot = false; void FillLookupContext(bool _is_cache_hit, bool _no_insert, TraceType _block_type, uint64_t _block_size, @@ -100,23 +105,25 @@ struct BlockCacheTraceRecord { Boolean no_insert = Boolean::kFalse; // Required field for Get and MultiGet uint64_t get_id = BlockCacheTraceHelper::kReservedGetId; - // Required fields for data block and user Get/Multi-Get only. + Boolean get_from_user_specified_snapshot = Boolean::kFalse; std::string referenced_key; + // Required fields for data block and user Get/Multi-Get only. 
uint64_t referenced_data_size = 0; uint64_t num_keys_in_block = 0; Boolean referenced_key_exist_in_block = Boolean::kFalse; BlockCacheTraceRecord() {} - BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key, - TraceType _block_type, uint64_t _block_size, - uint64_t _cf_id, std::string _cf_name, uint32_t _level, - uint64_t _sst_fd_number, TableReaderCaller _caller, - bool _is_cache_hit, bool _no_insert, uint64_t _get_id, - std::string _referenced_key = "", - uint64_t _referenced_data_size = 0, - uint64_t _num_keys_in_block = 0, - bool _referenced_key_exist_in_block = false) + BlockCacheTraceRecord( + uint64_t _access_timestamp, std::string _block_key, TraceType _block_type, + uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, + uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller, + bool _is_cache_hit, bool _no_insert, + uint64_t _get_id = BlockCacheTraceHelper::kReservedGetId, + bool _get_from_user_specified_snapshot = false, + std::string _referenced_key = "", uint64_t _referenced_data_size = 0, + uint64_t _num_keys_in_block = 0, + bool _referenced_key_exist_in_block = false) : access_timestamp(_access_timestamp), block_key(_block_key), block_type(_block_type), @@ -129,6 +136,9 @@ struct BlockCacheTraceRecord { is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), get_id(_get_id), + get_from_user_specified_snapshot(_get_from_user_specified_snapshot + ? Boolean::kTrue + : Boolean::kFalse), referenced_key(_referenced_key), referenced_data_size(_referenced_data_size), num_keys_in_block(_num_keys_in_block), diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc index aae513ad5d7..c9983aee190 100644 --- a/trace_replay/block_cache_tracer_test.cc +++ b/trace_replay/block_cache_tracer_test.cc @@ -74,6 +74,7 @@ class BlockCacheTracerTest : public testing::Test { // Provide get_id for all callers. The writer should only write get_id // when the caller is either GET or MGET. record.get_id = key_id + 1; + record.get_from_user_specified_snapshot = Boolean::kTrue; // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. 
@@ -126,20 +127,22 @@ class BlockCacheTracerTest : public testing::Test { if (record.caller == TableReaderCaller::kUserGet || record.caller == TableReaderCaller::kUserMultiGet) { ASSERT_EQ(key_id + 1, record.get_id); + ASSERT_EQ(Boolean::kTrue, record.get_from_user_specified_snapshot); + ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), + record.referenced_key); } else { ASSERT_EQ(BlockCacheTraceHelper::kReservedGetId, record.get_id); + ASSERT_EQ(Boolean::kFalse, record.get_from_user_specified_snapshot); + ASSERT_EQ("", record.referenced_key); } if (block_type == TraceType::kBlockTraceDataBlock && (record.caller == TableReaderCaller::kUserGet || record.caller == TableReaderCaller::kUserMultiGet)) { - ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id), - record.referenced_key); ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block); ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block); ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size); continue; } - ASSERT_EQ("", record.referenced_key); ASSERT_EQ(Boolean::kFalse, record.referenced_key_exist_in_block); ASSERT_EQ(0, record.num_keys_in_block); ASSERT_EQ(0, record.referenced_data_size); diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index ebfc4cd0eb0..90433df11bf 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -110,19 +110,22 @@ void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { std::string HybridRowBlockCacheSimulator::ComputeRowKey( const BlockCacheTraceRecord& access) { assert(access.get_id != BlockCacheTraceHelper::kReservedGetId); - Slice key; - if (access.referenced_key_exist_in_block == Boolean::kTrue) { - key = ExtractUserKey(access.referenced_key); - } else { - key = access.referenced_key; - } - return std::to_string(access.sst_fd_number) + "_" + key.ToString(); + Slice key = ExtractUserKey(access.referenced_key); + uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse + ? 0 + : 1 + GetInternalKeySeqno(access.referenced_key); + return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + + std::to_string(seq_no); } void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { bool is_cache_miss = true; bool admitted = true; - if (access.get_id != BlockCacheTraceHelper::kReservedGetId) { + // TODO (haoyu): We only support Get for now. We need to extend the tracing + // for MultiGet, i.e., non-data block accesses must log all keys in a + // MultiGet. + if (access.caller == TableReaderCaller::kUserGet && + access.get_id != BlockCacheTraceHelper::kReservedGetId) { // This is a Get/MultiGet request. const std::string& row_key = ComputeRowKey(access); if (getid_getkeys_map_[access.get_id].find(row_key) == diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index b6667eeed12..82972688658 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -137,7 +137,6 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { private: // Row key is a concatenation of the access's fd_number and the referenced // user key. - // TODO(haoyu): the row key should contain sequence number. 
std::string ComputeRowKey(const BlockCacheTraceRecord& access); enum InsertResult : char { diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index fb0c9e84976..f435785e6a1 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -174,10 +174,11 @@ TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { uint64_t block_id = 100; BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); + first_get.get_from_user_specified_snapshot = Boolean::kTrue; BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1); second_get.referenced_data_size = 0; second_get.referenced_key_exist_in_block = Boolean::kFalse; - second_get.referenced_key = kRefKeyPrefix + std::to_string(kGetId); + second_get.get_from_user_specified_snapshot = Boolean::kTrue; BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2); third_get.referenced_data_size = 0; third_get.referenced_key_exist_in_block = Boolean::kFalse; @@ -203,9 +204,10 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(100, cache_simulator->miss_ratio()); ASSERT_EQ(10, cache_simulator->user_accesses()); ASSERT_EQ(100, cache_simulator->user_miss_ratio()); - auto handle = - sim_cache->Lookup(ExtractUserKey(std::to_string(first_get.sst_fd_number) + - "_" + first_get.referenced_key)); + auto handle = sim_cache->Lookup( + std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString() + "_" + + std::to_string(1 + GetInternalKeySeqno(first_get.referenced_key))); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -227,8 +229,10 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(66, static_cast(cache_simulator->miss_ratio())); ASSERT_EQ(15, cache_simulator->user_accesses()); ASSERT_EQ(66, static_cast(cache_simulator->user_miss_ratio())); - handle = sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" + - second_get.referenced_key); + handle = sim_cache->Lookup( + std::to_string(second_get.sst_fd_number) + "_" + + ExtractUserKey(second_get.referenced_key).ToString() + "_" + + std::to_string(1 + GetInternalKeySeqno(second_get.referenced_key))); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -283,9 +287,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { cache_simulator->Access(first_get); block_id++; } - auto handle = - sim_cache->Lookup(ExtractUserKey(std::to_string(first_get.sst_fd_number) + - "_" + first_get.referenced_key)); + auto handle = sim_cache->Lookup( + std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString() + "_0"); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); // All blocks are missing from the cache since insert_blocks_row_kvpair_misses From 9f5cfb8e7142fe7b8fe4668aefd481e881f5bb42 Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Wed, 17 Jul 2019 17:01:30 -0700 Subject: [PATCH 232/572] Fix for ReadaheadSequentialFile crash in ldb_cmd_test (#5586) Summary: Fixing a corner case crash when there was no data read from file, but status is still OK Pull Request resolved: https://github.com/facebook/rocksdb/pull/5586 Differential Revision: D16348117 Pulled By: elipoz fbshipit-source-id: f97973308024f020d8be79ca3c56466b84d80656 --- util/file_reader_writer.cc | 9 +++++++-- 
util/file_reader_writer_test.cc | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 9175fa502f9..b6a5eefcfdb 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -738,7 +738,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { if (s.ok()) { buffer_offset_ = offset; buffer_.Size(result.size()); - assert(buffer_.BufferStart() == result.data()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); } return s; } @@ -886,7 +886,7 @@ class ReadaheadSequentialFile : public SequentialFile { if (s.ok()) { buffer_offset_ = read_offset_; buffer_.Size(result.size()); - assert(buffer_.BufferStart() == result.data()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); } return s; } @@ -1027,6 +1027,11 @@ std::unique_ptr NewReadaheadRandomAccessFile( std::unique_ptr SequentialFileReader::NewReadaheadSequentialFile( std::unique_ptr&& file, size_t readahead_size) { + if (file->GetRequiredBufferAlignment() >= readahead_size) { + // Short-circuit and return the original file if readahead_size is + // too small and hence doesn't make sense to be used for prefetching. + return std::move(file); + } std::unique_ptr result( new ReadaheadSequentialFile(std::move(file), readahead_size)); return result; diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index aa74303b8fc..1b86f798f7f 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -325,7 +325,7 @@ class ReadaheadSequentialFileTest : public testing::Test, public testing::WithParamInterface { public: static std::vector GetReadaheadSizeList() { - return {1lu << 12, 1lu << 16}; + return {1lu << 8, 1lu << 12, 1lu << 16, 1lu << 18}; } void SetUp() override { readahead_size_ = GetParam(); From ec2b996b29ab45d7d33a124f499344c8fb054229 Mon Sep 17 00:00:00 2001 From: anand76 Date: Wed, 17 Jul 2019 22:02:49 -0700 Subject: [PATCH 233/572] Fix LITE mode build failure Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5588 Test Plan: make LITE=1 all check Differential Revision: D16354543 Pulled By: anand1976 fbshipit-source-id: 327a171439e183ac3a5e5057c511d6bca445e97d --- db/import_column_family_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index a93ecbf1173..76a8b90fadd 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -555,7 +555,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as External SST File Writer and Import are not supported " "in ROCKSDB_LITE\n"); From 3a6e83b56bbbebbd351c6666b31398be960c135d Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 18 Jul 2019 10:13:05 -0700 Subject: [PATCH 234/572] HISTORY update for export and import column family APIs Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5587 Differential Revision: D16359919 fbshipit-source-id: cfd9c448d79a8b8e7ac1d2b661d10151df269dba --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index 2e1e03f68de..b9d0f741317 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. 
 * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path
 * Overload GetAllKeyVersions() to support non-default column family.
+* Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469

 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.

From abd1fdddef8c72a3ffa736284c03ea550ace211b Mon Sep 17 00:00:00 2001
From: anand76
Date: Thu, 18 Jul 2019 14:38:23 -0700
Subject: [PATCH 235/572] Fix asan_check failures

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5589

Test Plan: TEST_TMPDIR=/dev/shm/rocksdb COMPILE_WITH_ASAN=1 OPT=-g make J=64 -j64 asan_check

Differential Revision: D16361081

Pulled By: anand1976

fbshipit-source-id: 09474832b9cfb318a840d4b633e22dfad105d58c
---
 db/import_column_family_test.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc
index 76a8b90fadd..bc239c699ba 100644
--- a/db/import_column_family_test.cc
+++ b/db/import_column_family_test.cc
@@ -298,6 +298,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;

   ImportColumnFamilyOptions import_options;
   import_options.move_files = false;
@@ -407,6 +408,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;

   // Create a new db and import the files.
   DB* db_copy;
@@ -424,6 +426,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
     ASSERT_EQ(Get(1, Key(i)), value);
   }
   db_copy->DropColumnFamily(cfh);
+  db_copy->DestroyColumnFamilyHandle(cfh);
   test::DestroyDir(env_, dbname_ + "/db_copy");
 }

From 6bb3b4b567452ff88b6023d3db61bba2e4125d6c Mon Sep 17 00:00:00 2001
From: sdong
Date: Fri, 19 Jul 2019 11:31:52 -0700
Subject: [PATCH 236/572] ldb idump to support non-default column families. (#5594)

Summary: ldb idump currently works only for the default column family. Extend it to support non-default column families as well.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5594

Test Plan: Compile and run the tool against a DB with multiple column families.
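For context, the command now goes through the GetAllKeyVersions() overload that takes a ColumnFamilyHandle (see the HISTORY.md entry above). A minimal sketch of calling it directly, with the header path and KeyVersion field names assumed from rocksdb/utilities/debug.h and placeholder bounds:

    // Illustration only: dump internal key versions from a non-default
    // column family, mirroring what InternalDumpCommand::DoCommand() now does.
    #include <cinttypes>
    #include <cstdio>
    #include <vector>

    #include "rocksdb/utilities/debug.h"  // assumed location of GetAllKeyVersions()

    rocksdb::Status DumpKeyVersions(rocksdb::DB* db,
                                    rocksdb::ColumnFamilyHandle* cfh) {
      std::vector<rocksdb::KeyVersion> key_versions;
      // Empty bounds are assumed to cover the whole keyspace; 100 caps output.
      rocksdb::Status st =
          rocksdb::GetAllKeyVersions(db, cfh, "", "", 100, &key_versions);
      if (st.ok()) {
        for (const auto& kv : key_versions) {
          fprintf(stdout, "%s @ seq %" PRIu64 " => %s\n", kv.user_key.c_str(),
                  kv.sequence, kv.value.c_str());
        }
      }
      return st;
    }

Passing db->DefaultColumnFamily() as the handle should preserve the previous behavior.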
Differential Revision: D16380684

fbshipit-source-id: bfb8af36fdad1806837c90aaaab492d71528aceb
---
 tools/ldb_cmd.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 8f4258cf36e..22b2399a278 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -1301,7 +1301,8 @@ void InternalDumpCommand::DoCommand() {
   // Cast as DBImpl to get internal iterator
   std::vector<KeyVersion> key_versions;
-  Status st = GetAllKeyVersions(db_, from_, to_, max_keys_, &key_versions);
+  Status st = GetAllKeyVersions(db_, GetCfHandle(), from_, to_, max_keys_,
+                                &key_versions);
   if (!st.ok()) {
     exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;

From c129c75fb7810959a3da548d03bd3cededcb0a8f Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Fri, 19 Jul 2019 11:54:38 -0700
Subject: [PATCH 237/572] Added log_readahead_size option to control prefetching for Log::Reader (#5592)

Summary: Added log_readahead_size option to control prefetching for Log::Reader. This is mostly useful for reading a remotely located log, as it can reduce the number of round-trips when reading it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5592

Differential Revision: D16362989

Pulled By: elipoz

fbshipit-source-id: c5d4d5245a44008cd59879640efff70c091ad3e8
---
 db/db_impl/db_impl_open.cc       |  3 ++-
 db/db_impl/db_impl_secondary.cc  |  3 ++-
 db/version_set.cc                |  9 ++++++---
 include/rocksdb/env.h            |  4 ++--
 include/rocksdb/options.h        |  7 +++++++
 options/db_options.cc            |  6 +++++-
 options/db_options.h             |  1 +
 options/options_helper.cc        |  5 ++++-
 options/options_settable_test.cc |  3 ++-
 9 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 82e61a260b8..0e0fcfbf2c3 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -721,7 +721,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
         continue;
       }
     }
-    file_reader.reset(new SequentialFileReader(std::move(file), fname));
+    file_reader.reset(new SequentialFileReader(
+        std::move(file), fname, immutable_db_options_.log_readahead_size));
   }

   // Create the log reader.
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index e14e53e55c3..a73cd6ba296 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -150,7 +150,8 @@ Status DBImplSecondary::MaybeInitLogReader(
       *log_reader = nullptr;
       return status;
     }
-    file_reader.reset(new SequentialFileReader(std::move(file), fname));
+    file_reader.reset(new SequentialFileReader(
+        std::move(file), fname, immutable_db_options_.log_readahead_size));
   }

   // Create the log reader.
diff --git a/db/version_set.cc b/db/version_set.cc
index 0d3b9fb4e32..559a4190f16 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -4267,7 +4267,8 @@ Status VersionSet::Recover(
       return s;
     }
     manifest_file_reader.reset(
-        new SequentialFileReader(std::move(manifest_file), manifest_path));
+        new SequentialFileReader(std::move(manifest_file), manifest_path,
+                                 db_options_->log_readahead_size));
   }
   uint64_t current_manifest_file_size;
   s = env_->GetFileSize(manifest_path, &current_manifest_file_size);
@@ -4597,7 +4598,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
     if (!s.ok()) {
       return s;
     }
-    file_reader.reset(new SequentialFileReader(std::move(file), dscname));
+    file_reader.reset(new SequentialFileReader(
+        std::move(file), dscname, db_options_->log_readahead_size));
   }

   bool have_prev_log_number = false;
@@ -5721,7 +5723,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest(
   std::unique_ptr<SequentialFileReader> manifest_file_reader;
   if (s.ok()) {
     manifest_file_reader.reset(
-        new SequentialFileReader(std::move(manifest_file), manifest_path));
+        new SequentialFileReader(std::move(manifest_file), manifest_path,
+                                 db_options_->log_readahead_size));
     manifest_reader->reset(new log::FragmentBufferedReader(
         nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
         0 /* log_number */));
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 67464cc5c55..126f25747ff 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -118,10 +118,10 @@ struct EnvOptions {
   bool fallocate_with_keep_size = true;

   // See DBOptions doc
-  size_t compaction_readahead_size;
+  size_t compaction_readahead_size = 0;

   // See DBOptions doc
-  size_t random_access_max_buffer_size;
+  size_t random_access_max_buffer_size = 0;

   // See DBOptions doc
   size_t writable_file_max_buffer_size = 1024 * 1024;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 09dc8e54c5c..234af6a31eb 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1087,6 +1087,13 @@ struct DBOptions {
   // If set to true, takes precedence over
   // ReadOptions::background_purge_on_iterator_cleanup.
   bool avoid_unnecessary_blocking_io = false;
+
+  // If non-zero, we perform bigger reads when reading the log, prefetching
+  // log_readahead_size bytes at a time. This is mostly useful for reading a
+  // remotely located log, as it can reduce the number of round-trips. If 0,
+  // prefetching is disabled.
+ // + // Default: 0 + size_t log_readahead_size = 0; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/options/db_options.cc b/options/db_options.cc index 490a3708030..3756c555ceb 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -85,7 +85,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) manual_wal_flush(options.manual_wal_flush), atomic_flush(options.atomic_flush), avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), - persist_stats_to_disk(options.persist_stats_to_disk) { + persist_stats_to_disk(options.persist_stats_to_disk), + log_readahead_size(options.log_readahead_size) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -225,6 +226,9 @@ void ImmutableDBOptions::Dump(Logger* log) const { avoid_unnecessary_blocking_io); ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", persist_stats_to_disk); + ROCKS_LOG_HEADER( + log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, + log_readahead_size); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 92eea4ecfa1..e39e2903ff3 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -82,6 +82,7 @@ struct ImmutableDBOptions { bool atomic_flush; bool avoid_unnecessary_blocking_io; bool persist_stats_to_disk; + size_t log_readahead_size; }; struct MutableDBOptions { diff --git a/options/options_helper.cc b/options/options_helper.cc index 47aba7ad035..922ece3a81a 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -138,7 +138,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.atomic_flush = immutable_db_options.atomic_flush; options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; - + options.log_readahead_size = immutable_db_options.log_readahead_size; return options; } @@ -1664,6 +1664,9 @@ std::unordered_map {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, false, offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}}, + {"log_readahead_size", + {offsetof(struct DBOptions, log_readahead_size), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, }; std::unordered_map diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index f0b79e372f7..e60fd6f9ebf 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -295,7 +295,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "manual_wal_flush=false;" "seq_per_batch=false;" "atomic_flush=false;" - "avoid_unnecessary_blocking_io=false", + "avoid_unnecessary_blocking_io=false;" + "log_readahead_size=0", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), From 4f7ba3aaed08b0f29a2828a44fb9eed525f47610 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 19 Jul 2019 13:20:45 -0700 Subject: [PATCH 238/572] Fix tsan and valgrind failures in import_column_family_test Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5598 Test Plan: tsan_check valgrind_test Differential Revision: D16380167 Pulled By: anand1976 fbshipit-source-id: 2d0caea7d2d02a9606457f62811175d762b89d5c --- db/import_column_family_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index bc239c699ba..4f695d33f90 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ 
-427,6 +427,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { } db_copy->DropColumnFamily(cfh); db_copy->DestroyColumnFamilyHandle(cfh); + delete db_copy; test::DestroyDir(env_, dbname_ + "/db_copy"); } From 0be1feec216cfdbc1c8feab95c88dad2eefab3df Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Fri, 19 Jul 2019 14:55:07 -0700 Subject: [PATCH 239/572] Added .watchmanconfig file to rocksdb repo (#5593) Summary: Added a .watchmanconfig file to the rocksdb repo. It is currently .gitignored. This allows watchman to auto-sync modified files when they are edited remotely. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5593 Differential Revision: D16363860 Pulled By: elipoz fbshipit-source-id: 5ae221e21c6c757ceb08877771550d508f773d55 --- .gitignore | 1 + .watchmanconfig | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 .watchmanconfig diff --git a/.gitignore b/.gitignore index 180fb4c5007..7a799c09a9d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ make_config.mk *.vcxproj.filters *.sln *.cmake +.watchmanconfig CMakeCache.txt CMakeFiles/ build/ diff --git a/.watchmanconfig b/.watchmanconfig new file mode 100644 index 00000000000..e5b450d7bbb --- /dev/null +++ b/.watchmanconfig @@ -0,0 +1,6 @@ +{ + "content_hash_warming": true, + "content_hash_max_items": 333333, + "hint_num_files_per_dir": 8, + "fsevents_latency": 0.05 +} From a78503bd6c80a3c4137df1962a972fe406b4d90b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 22 Jul 2019 14:35:03 -0700 Subject: [PATCH 240/572] Temporarily disable snapshot list refresh for atomic flush stress test (#5581) Summary: The atomic flush test started to fail after https://github.com/facebook/rocksdb/issues/5099. Then https://github.com/facebook/rocksdb/issues/5278 provided a fix, after which the same error occurred much less frequently. However, it still occurs occasionally, and the root cause is not yet known. This PR disables the snapshot list refresh feature; we should keep an eye on the failure in the future.
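Expressed directly against the options API, the stress test change amounts to the following sketch; snap_refresh_nanos is the option that the new db_stress flag below feeds into:

rocksdb::Options options;
// db_stress defaults its flag to 100 * 1000 * 1000 ns (100ms); passing 0, as
// the crash test now does under atomic flush, turns periodic snapshot list
// refresh off entirely.
options.snap_refresh_nanos = 0;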
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5581 Differential Revision: D16295985 Pulled By: riversand963 fbshipit-source-id: c9e62e65133c52c21b07097de359632ca62571e4 --- tools/db_crashtest.py | 1 + tools/db_stress.cc | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 2a38d4c96d9..709406e56f4 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -141,6 +141,7 @@ def is_direct_io_supported(dbname): "write_buffer_size": 1024 * 1024, # disable pipelined write when test_atomic_flush is true "enable_pipelined_write": 0, + "snap_refresh_nanos": 0, } diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 3f767a9e76a..98d088e345e 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -512,6 +512,10 @@ DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); static const bool FLAGS_num_iterations_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); +DEFINE_uint64( + snap_refresh_nanos, 100 * 1000 * 1000, + "If non-zero, compactions will periodically refresh snapshot list."); + namespace { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -2724,6 +2728,8 @@ class StressTest { fprintf(stdout, " %s\n", p.c_str()); } } + fprintf(stdout, "Snapshot refresh nanos : %" PRIu64 "\n", + FLAGS_snap_refresh_nanos); fprintf(stdout, "------------------------------------------------\n"); } @@ -2873,6 +2879,7 @@ class StressTest { } else { options_.merge_operator = MergeOperators::CreatePutOperator(); } + options_.snap_refresh_nanos = FLAGS_snap_refresh_nanos; fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); From 3778470061c77f773fab1e433c2ecad7ff02f293 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 22 Jul 2019 17:47:54 -0700 Subject: [PATCH 241/572] Block cache analyzer: Compute correlation of features and human readable trace file. (#5596) Summary: - Compute the correlation between a few features and predictions, e.g., the number of accesses since the last access vs. the number of accesses till the next access on a block. - Output a human-readable trace file so Python can consume it.
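As a sketch of how the new pieces fit together, the analyzer can also be driven programmatically; the paths below are placeholders, while the constructor arguments and the WriteCorrelationFeatures* methods are the ones added in this patch:

#include "tools/block_cache_trace_analyzer.h"

int main() {
  rocksdb::BlockCacheTraceAnalyzer analyzer(
      "/path/to/block_cache_trace",          // placeholder trace file
      "/path/to/result_dir",                 // placeholder output directory
      "/path/to/result_dir/human_readable",  // new: human-readable dump
      /*compute_reuse_distance=*/false,
      /*mrc_only=*/false,
      /*cache_simulator=*/nullptr);
  rocksdb::Status s = analyzer.Analyze();
  // Analyze() returns Incomplete once it has consumed the whole trace.
  if (s.ok() || s.IsIncomplete()) {
    // Correlate past-access features with accesses/time till the next access.
    analyzer.WriteCorrelationFeatures("all", /*max_number_of_values=*/1000000);
    analyzer.WriteCorrelationFeaturesForGet(/*max_number_of_values=*/1000000);
  }
  return 0;
}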
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5596 Test Plan: make clean && USE_CLANG=1 make check -j32 Differential Revision: D16373200 Pulled By: HaoyuHuang fbshipit-source-id: c848d26bc2e9210461f317d7dbee42d55be5a0cc --- tools/block_cache_trace_analyzer.cc | 475 ++++++++++++++++-- tools/block_cache_trace_analyzer.h | 95 +++- tools/block_cache_trace_analyzer_test.cc | 76 ++- trace_replay/block_cache_tracer.cc | 14 + trace_replay/block_cache_tracer.h | 5 +- utilities/simulator_cache/cache_simulator.cc | 66 +-- utilities/simulator_cache/cache_simulator.h | 85 ++-- .../simulator_cache/cache_simulator_test.cc | 79 +-- 8 files changed, 753 insertions(+), 142 deletions(-) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 76633846257..08143ebcf88 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -7,11 +7,16 @@ #ifdef GFLAGS #include "tools/block_cache_trace_analyzer.h" +#include #include +#include +#include #include #include #include +#include #include + #include "monitoring/histogram.h" #include "util/gflags_compat.h" #include "util/string_util.h" @@ -122,6 +127,20 @@ DEFINE_string(analyze_get_spatial_locality_labels, "", "Group data blocks using these labels."); DEFINE_string(analyze_get_spatial_locality_buckets, "", "Group data blocks by their statistics using these buckets."); +DEFINE_bool(mrc_only, false, "Evaluate alternative cache policies only. When this flag is true, " "the analyzer does NOT maintain states of each block in memory for " "analysis. It only feeds the accesses into the cache simulators."); +DEFINE_string( + analyze_correlation_coefficients_labels, "", + "Analyze the correlation coefficients of features such as number of past " + "accesses with regard to the number of accesses till the next access."); +DEFINE_int32(analyze_correlation_coefficients_max_number_of_values, 1000000, + "The maximum number of values for a feature. If the number of " + "values for a feature is larger than this max, it randomly " + "selects 'max' number of values."); +DEFINE_string(human_readable_trace_file_path, "", + "The file path that saves human readable access records."); namespace rocksdb { namespace { const std::string kSupportedCacheNames = "ghost_lru_hybrid_no_insert_on_row_miss "; // The suffix for the generated csv files.
+const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline"; +const std::string kFileNameSuffixMissTimeline = "miss_timeline"; const std::string kFileNameSuffixAccessTimeline = "access_timeline"; +const std::string kFileNameSuffixCorrelation = "correlation_input"; const std::string kFileNameSuffixAvgReuseIntervalNaccesses = "avg_reuse_interval_naccesses"; const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval"; @@ -279,6 +301,18 @@ double percent(uint64_t numerator, uint64_t denomenator) { return static_cast(numerator * 100.0 / denomenator); } +std::map adjust_time_unit( + const std::map& time_stats, uint64_t time_unit) { + if (time_unit == 1) { + return time_stats; + } + std::map adjusted_time_stats; + for (auto const& time : time_stats) { + adjusted_time_stats[static_cast(time.first / time_unit)] += + time.second; + } + return adjusted_time_stats; +} } // namespace void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { @@ -288,8 +322,12 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (output_dir_.empty()) { return; } + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + uint64_t total_accesses = access_sequence_number_; const std::string output_miss_ratio_curve_path = - output_dir_ + "/" + kMissRatioCurveFileName; + output_dir_ + "/" + std::to_string(trace_duration) + "_" + + std::to_string(total_accesses) + "_" + kMissRatioCurveFileName; std::ofstream out(output_miss_ratio_curve_path); if (!out.is_open()) { return; @@ -302,7 +340,8 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { for (auto const& config_caches : cache_simulator_->sim_caches()) { const CacheConfiguration& config = config_caches.first; for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { - double miss_ratio = config_caches.second[i]->miss_ratio(); + double miss_ratio = + config_caches.second[i]->miss_ratio_stats().miss_ratio(); // Write the body. 
out << config.cache_name; out << ","; @@ -314,13 +353,287 @@ void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { out << ","; out << std::fixed << std::setprecision(4) << miss_ratio; out << ","; - out << config_caches.second[i]->total_accesses(); + out << config_caches.second[i]->miss_ratio_stats().total_accesses(); out << std::endl; } } out.close(); } +void BlockCacheTraceAnalyzer::UpdateFeatureVectors( + const std::vector& access_sequence_number_timeline, + const std::vector& access_timeline, const std::string& label, + std::map* label_features, + std::map* label_predictions) const { + if (access_sequence_number_timeline.empty() || access_timeline.empty()) { + return; + } + assert(access_timeline.size() == access_sequence_number_timeline.size()); + uint64_t prev_access_sequence_number = access_sequence_number_timeline[0]; + uint64_t prev_access_timestamp = access_timeline[0]; + for (uint32_t i = 0; i < access_sequence_number_timeline.size(); i++) { + uint64_t num_accesses_since_last_access = + access_sequence_number_timeline[i] - prev_access_sequence_number; + uint64_t elapsed_time_since_last_access = + access_timeline[i] - prev_access_timestamp; + prev_access_sequence_number = access_sequence_number_timeline[i]; + prev_access_timestamp = access_timeline[i]; + if (i < access_sequence_number_timeline.size() - 1) { + (*label_features)[label].num_accesses_since_last_access.push_back( + num_accesses_since_last_access); + (*label_features)[label].num_past_accesses.push_back(i); + (*label_features)[label].elapsed_time_since_last_access.push_back( + elapsed_time_since_last_access); + } + if (i >= 1) { + (*label_predictions)[label].num_accesses_till_next_access.push_back( + num_accesses_since_last_access); + (*label_predictions)[label].elapsed_time_till_next_access.push_back( + elapsed_time_since_last_access); + } + } +} + +void BlockCacheTraceAnalyzer::WriteMissRatioTimeline(uint64_t time_unit) const { + if (!cache_simulator_ || output_dir_.empty()) { + return; + } + std::map>> + cs_name_timeline; + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + const std::map& trace_num_misses = + adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit); + const std::map& trace_num_accesses = + adjust_time_unit(miss_ratio_stats_.num_accesses_timeline(), time_unit); + assert(trace_num_misses.size() == trace_num_accesses.size()); + for (auto const& num_miss : trace_num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + auto it = trace_num_accesses.find(time); + assert(it != trace_num_accesses.end()); + uint64_t access = it->second; + cs_name_timeline[port::kMaxUint64]["trace"][time] = percent(miss, access); + } + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + std::string cache_label = config.cache_name + "-" + + std::to_string(config.num_shard_bits) + "-" + + std::to_string(config.ghost_cache_capacity); + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + const std::map& num_misses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_misses_timeline(), + time_unit); + const std::map& num_accesses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_accesses_timeline(), + time_unit); + assert(num_misses.size() == num_accesses.size()); + for (auto const& num_miss : num_misses) { + uint64_t time = num_miss.first; + start_time = 
std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + auto it = num_accesses.find(time); + assert(it != num_accesses.end()); + uint64_t access = it->second; + cs_name_timeline[config.cache_capacities[i]][cache_label][time] = + percent(miss, access); + } + } + } + for (auto const& it : cs_name_timeline) { + const std::string output_miss_ratio_timeline_path = + output_dir_ + "/" + std::to_string(it.first) + "_" + + std::to_string(time_unit) + "_" + kFileNameSuffixMissRatioTimeline; + std::ofstream out(output_miss_ratio_timeline_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : it.second) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto misses = label.second.find(now); + row += ","; + if (misses != label.second.end()) { + row += std::to_string(misses->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const { + if (!cache_simulator_ || output_dir_.empty()) { + return; + } + std::map>> + cs_name_timeline; + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + const std::map& trace_num_misses = + adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit); + for (auto const& num_miss : trace_num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + cs_name_timeline[port::kMaxUint64]["trace"][time] = miss; + } + for (auto const& config_caches : cache_simulator_->sim_caches()) { + const CacheConfiguration& config = config_caches.first; + std::string cache_label = config.cache_name + "-" + + std::to_string(config.num_shard_bits) + "-" + + std::to_string(config.ghost_cache_capacity); + for (uint32_t i = 0; i < config.cache_capacities.size(); i++) { + const std::map& num_misses = adjust_time_unit( + config_caches.second[i]->miss_ratio_stats().num_misses_timeline(), + time_unit); + for (auto const& num_miss : num_misses) { + uint64_t time = num_miss.first; + start_time = std::min(start_time, time); + end_time = std::max(end_time, time); + uint64_t miss = num_miss.second; + cs_name_timeline[config.cache_capacities[i]][cache_label][time] = miss; + } + } + } + for (auto const& it : cs_name_timeline) { + const std::string output_miss_ratio_timeline_path = + output_dir_ + "/" + std::to_string(it.first) + "_" + + std::to_string(time_unit) + "_" + kFileNameSuffixMissTimeline; + std::ofstream out(output_miss_ratio_timeline_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (uint64_t now = start_time; now <= end_time; now++) { + header += ","; + header += std::to_string(now); + } + out << header << std::endl; + for (auto const& label : it.second) { + std::string row(label.first); + for (uint64_t now = start_time; now <= end_time; now++) { + auto misses = label.second.find(now); + row += ","; + if (misses != label.second.end()) { + row += std::to_string(misses->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeatures( + const std::string& label_str, uint32_t max_number_of_values) const { + std::set labels = ParseLabelStr(label_str); + std::map 
label_features; + std::map label_predictions; + auto block_callback = + [&](const std::string& cf_name, uint64_t fd, uint32_t level, + TraceType block_type, const std::string& /*block_key*/, + uint64_t /*block_key_id*/, const BlockAccessInfo& block) { + if (labels.find(kGroupbyCaller) != labels.end()) { + // Group by caller. + for (auto const& caller_map : block.caller_access_timeline) { + const std::string label = + BuildLabel(labels, cf_name, fd, level, block_type, + caller_map.first, /*block_id=*/0); + auto it = block.caller_access_sequence__number_timeline.find( + caller_map.first); + assert(it != block.caller_access_sequence__number_timeline.end()); + UpdateFeatureVectors(it->second, caller_map.second, label, + &label_features, &label_predictions); + } + return; + } + const std::string label = BuildLabel( + labels, cf_name, fd, level, block_type, + TableReaderCaller::kMaxBlockCacheLookupCaller, /*block_id=*/0); + UpdateFeatureVectors(block.access_sequence_number_timeline, + block.access_timeline, label, &label_features, + &label_predictions); + }; + TraverseBlocks(block_callback); + WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions, + max_number_of_values); +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesToFile( + const std::string& label, + const std::map& label_features, + const std::map& label_predictions, + uint32_t max_number_of_values) const { + std::default_random_engine rand_engine(env_->NowMicros()); + for (auto const& label_feature_vectors : label_features) { + const Features& past = label_feature_vectors.second; + auto it = label_predictions.find(label_feature_vectors.first); + assert(it != label_predictions.end()); + const Predictions& future = it->second; + const std::string output_path = output_dir_ + "/" + label + "_" + + label_feature_vectors.first + "_" + + kFileNameSuffixCorrelation; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header( + "num_accesses_since_last_access,elapsed_time_since_last_access,num_" + "past_accesses,num_accesses_till_next_access,elapsed_time_till_next_" + "access"); + out << header << std::endl; + std::vector indexes; + for (uint32_t i = 0; i < past.num_accesses_since_last_access.size(); i++) { + indexes.push_back(i); + } + std::shuffle(indexes.begin(), indexes.end(), rand_engine); + for (uint32_t i = 0; i < max_number_of_values && i < indexes.size(); i++) { + uint32_t rand_index = indexes[i]; + out << std::to_string(past.num_accesses_since_last_access[rand_index]) + << ","; + out << std::to_string(past.elapsed_time_since_last_access[rand_index]) + << ","; + out << std::to_string(past.num_past_accesses[rand_index]) << ","; + out << std::to_string(future.num_accesses_till_next_access[rand_index]) + << ","; + out << std::to_string(future.elapsed_time_till_next_access[rand_index]) + << std::endl; + } + out.close(); + } +} + +void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesForGet( + uint32_t max_number_of_values) const { + std::string label = "GetKeyInfo"; + std::map label_features; + std::map label_predictions; + for (auto const& get_info : get_key_info_map_) { + const GetKeyInfo& info = get_info.second; + UpdateFeatureVectors(info.access_sequence_number_timeline, + info.access_timeline, label, &label_features, + &label_predictions); + } + WriteCorrelationFeaturesToFile(label, label_features, label_predictions, + max_number_of_values); +} + std::set BlockCacheTraceAnalyzer::ParseLabelStr( const std::string& label_str) const { std::stringstream ss(label_str); @@ 
-371,7 +684,6 @@ void BlockCacheTraceAnalyzer::TraverseBlocks( uint64_t /*block_key_id*/, const BlockAccessInfo& /*block_access_info*/)> block_callback) const { - uint64_t block_id = 0; for (auto const& cf_aggregates : cf_aggregates_map_) { // Stats per column family. const std::string& cf_name = cf_aggregates.first; @@ -387,8 +699,8 @@ void BlockCacheTraceAnalyzer::TraverseBlocks( block_type_aggregates.second.block_access_info_map) { // Stats per block. block_callback(cf_name, fd, level, type, block_access_info.first, - block_id, block_access_info.second); - block_id++; + block_access_info.second.block_id, + block_access_info.second); } } } @@ -1046,12 +1358,15 @@ void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats( BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( const std::string& trace_file_path, const std::string& output_dir, - bool compute_reuse_distance, + const std::string& human_readable_trace_file_path, + bool compute_reuse_distance, bool mrc_only, std::unique_ptr&& cache_simulator) : env_(rocksdb::Env::Default()), trace_file_path_(trace_file_path), output_dir_(output_dir), + human_readable_trace_file_path_(human_readable_trace_file_path), compute_reuse_distance_(compute_reuse_distance), + mrc_only_(mrc_only), cache_simulator_(std::move(cache_simulator)) {} void BlockCacheTraceAnalyzer::ComputeReuseDistance( @@ -1072,7 +1387,29 @@ void BlockCacheTraceAnalyzer::ComputeReuseDistance( info->unique_blocks_since_last_access.clear(); } -void BlockCacheTraceAnalyzer::RecordAccess( +Status BlockCacheTraceAnalyzer::WriteHumanReadableTraceRecord( + const BlockCacheTraceRecord& access, uint64_t block_id, + uint64_t get_key_id) { + if (!human_readable_trace_file_writer_) { + return Status::OK(); + } + int ret = snprintf( + trace_record_buffer_, sizeof(trace_record_buffer_), + "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%" PRIu32 ",%" PRIu64 + "" + ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n", + access.access_timestamp, block_id, access.block_type, access.block_size, + access.cf_id, access.level, access.sst_fd_number, access.caller, + access.no_insert, access.get_id, get_key_id, access.referenced_data_size, + access.is_cache_hit); + if (ret < 0) { + return Status::IOError("failed to format the output"); + } + std::string printout(trace_record_buffer_); + return human_readable_trace_file_writer_->Append(printout); +} + +Status BlockCacheTraceAnalyzer::RecordAccess( const BlockCacheTraceRecord& access) { ColumnFamilyAccessInfoAggregate& cf_aggr = cf_aggregates_map_[access.cf_name]; SSTFileAccessInfoAggregate& file_aggr = @@ -1080,18 +1417,30 @@ void BlockCacheTraceAnalyzer::RecordAccess( file_aggr.level = access.level; BlockTypeAccessInfoAggregate& block_type_aggr = file_aggr.block_type_aggregates_map[access.block_type]; + if (block_type_aggr.block_access_info_map.find(access.block_key) == + block_type_aggr.block_access_info_map.end()) { + block_type_aggr.block_access_info_map[access.block_key].block_id = + unique_block_id_; + unique_block_id_++; + } BlockAccessInfo& block_access_info = block_type_aggr.block_access_info_map[access.block_key]; if (compute_reuse_distance_) { ComputeReuseDistance(&block_access_info); } - block_access_info.AddAccess(access); + block_access_info.AddAccess(access, access_sequence_number_); block_info_map_[access.block_key] = &block_access_info; - if (trace_start_timestamp_in_seconds_ == 0) { - trace_start_timestamp_in_seconds_ = - access.access_timestamp / kMicrosInSecond; + uint64_t get_key_id = 0; + if (access.caller == 
TableReaderCaller::kUserGet && + access.get_id != BlockCacheTraceHelper::kReservedGetId) { + std::string row_key = BlockCacheTraceHelper::ComputeRowKey(access); + if (get_key_info_map_.find(row_key) == get_key_info_map_.end()) { + get_key_info_map_[row_key].key_id = unique_get_key_id_; + get_key_id = unique_get_key_id_; + unique_get_key_id_++; + } + get_key_info_map_[row_key].AddAccess(access, access_sequence_number_); } - trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; if (compute_reuse_distance_) { // Add this block to all existing blocks. @@ -1108,6 +1457,8 @@ void BlockCacheTraceAnalyzer::RecordAccess( } } } + return WriteHumanReadableTraceRecord(access, block_access_info.block_id, + get_key_id); } Status BlockCacheTraceAnalyzer::Analyze() { @@ -1122,32 +1473,68 @@ Status BlockCacheTraceAnalyzer::Analyze() { if (!s.ok()) { return s; } + if (!human_readable_trace_file_path_.empty()) { + s = env_->NewWritableFile(human_readable_trace_file_path_, + &human_readable_trace_file_writer_, EnvOptions()); + if (!s.ok()) { + return s; + } + } uint64_t start = env_->NowMicros(); - uint64_t processed_records = 0; uint64_t time_interval = 0; while (s.ok()) { BlockCacheTraceRecord access; s = reader.ReadAccess(&access); if (!s.ok()) { - return s; + break; + } + if (!mrc_only_) { + s = RecordAccess(access); + if (!s.ok()) { + break; + } } - RecordAccess(access); + if (trace_start_timestamp_in_seconds_ == 0) { + trace_start_timestamp_in_seconds_ = + access.access_timestamp / kMicrosInSecond; + } + trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond; + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + is_user_access(access.caller), + access.is_cache_hit == Boolean::kFalse); if (cache_simulator_) { cache_simulator_->Access(access); } - processed_records++; + access_sequence_number_++; uint64_t now = env_->NowMicros(); uint64_t duration = (now - start) / kMicrosInSecond; if (duration > 10 * time_interval) { + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; fprintf(stdout, "Running for %" PRIu64 " seconds: Processed %" PRIu64 - " records/second\n", - duration, processed_records / duration); - processed_records = 0; + " records/second. Trace duration %" PRIu64 + " seconds. Observed miss ratio %.2f\n", + duration, duration > 0 ? access_sequence_number_ / duration : 0, + trace_duration, miss_ratio_stats_.miss_ratio()); time_interval++; } } - return Status::OK(); + if (human_readable_trace_file_writer_) { + human_readable_trace_file_writer_->Flush(); + human_readable_trace_file_writer_->Close(); + } + uint64_t now = env_->NowMicros(); + uint64_t duration = (now - start) / kMicrosInSecond; + uint64_t trace_duration = + trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; + fprintf(stdout, + "Running for %" PRIu64 " seconds: Processed %" PRIu64 + " records/second. Trace duration %" PRIu64 + " seconds. Observed miss ratio %.2f\n", + duration, duration > 0 ? 
access_sequence_number_ / duration : 0, + trace_duration, miss_ratio_stats_.miss_ratio()); + return s; } void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const { @@ -1321,15 +1708,6 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, "Top %" PRIu32 " access count blocks access_count=%" PRIu64 " %s\n", top_k, naccess_it->first, statistics.c_str()); - // if (block->referenced_data_size > block->block_size) { - // for (auto const& ref_key_it : block->key_num_access_map) { - // ParsedInternalKey internal_key; - // ParseInternalKey(ref_key_it.first, &internal_key); - // printf("######%lu %lu %d %s\n", block->referenced_data_size, - // block->block_size, internal_key.type, - // internal_key.user_key.ToString().c_str()); - // } - // } } } @@ -1696,16 +2074,32 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { exit(1); } } - BlockCacheTraceAnalyzer analyzer( - FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir, - !FLAGS_reuse_distance_labels.empty(), std::move(cache_simulator)); + BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, + FLAGS_block_cache_analysis_result_dir, + FLAGS_human_readable_trace_file_path, + !FLAGS_reuse_distance_labels.empty(), + FLAGS_mrc_only, std::move(cache_simulator)); Status s = analyzer.Analyze(); - if (!s.IsIncomplete()) { + if (!s.IsIncomplete() && !s.ok()) { // Read all traces. fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str()); exit(1); } fprintf(stdout, "Status: %s\n", s.ToString().c_str()); + analyzer.WriteMissRatioCurves(); + analyzer.WriteMissRatioTimeline(1); + analyzer.WriteMissRatioTimeline(kSecondInMinute); + analyzer.WriteMissRatioTimeline(kSecondInHour); + analyzer.WriteMissTimeline(1); + analyzer.WriteMissTimeline(kSecondInMinute); + analyzer.WriteMissTimeline(kSecondInHour); + + if (FLAGS_mrc_only) { + fprintf(stdout, + "Skipping the analysis statistics since the user wants to compute " + "MRC only"); + return 0; + } analyzer.PrintStatsSummary(); if (FLAGS_print_access_count_stats) { @@ -1727,7 +2121,6 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.PrintDataBlockAccessStats(); } print_break_lines(/*num_break_lines=*/3); - analyzer.WriteMissRatioCurves(); if (!FLAGS_timeline_labels.empty()) { std::stringstream ss(FLAGS_timeline_labels); @@ -1819,6 +2212,18 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.WriteGetSpatialLocality(label, buckets); } } + + if (!FLAGS_analyze_correlation_coefficients_labels.empty()) { + std::stringstream ss(FLAGS_analyze_correlation_coefficients_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteCorrelationFeatures( + label, FLAGS_analyze_correlation_coefficients_max_number_of_values); + } + analyzer.WriteCorrelationFeaturesForGet( + FLAGS_analyze_correlation_coefficients_max_number_of_values); + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index 32a90342cb1..bc41ff468cc 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -16,8 +16,23 @@ #include "utilities/simulator_cache/cache_simulator.h" namespace rocksdb { + +// Statistics of a key refereneced by a Get. 
+struct GetKeyInfo { + uint64_t key_id = 0; + std::vector access_sequence_number_timeline; + std::vector access_timeline; + + void AddAccess(const BlockCacheTraceRecord& access, + uint64_t access_sequnce_number) { + access_sequence_number_timeline.push_back(access_sequnce_number); + access_timeline.push_back(access.access_timestamp); + } +}; + // Statistics of a block. struct BlockAccessInfo { + uint64_t block_id = 0; uint64_t num_accesses = 0; uint64_t block_size = 0; uint64_t first_access_time = 0; @@ -39,7 +54,16 @@ struct BlockAccessInfo { // Number of reuses grouped by reuse distance. std::map reuse_distance_count; - void AddAccess(const BlockCacheTraceRecord& access) { + // The access sequence numbers of this block. + std::vector access_sequence_number_timeline; + std::map> + caller_access_sequence__number_timeline; + // The access timestamp in microseconds of this block. + std::vector access_timeline; + std::map> caller_access_timeline; + + void AddAccess(const BlockCacheTraceRecord& access, + uint64_t access_sequnce_number) { if (block_size != 0 && access.block_size != 0) { assert(block_size == access.block_size); } @@ -57,6 +81,12 @@ struct BlockAccessInfo { const uint64_t timestamp_in_seconds = access.access_timestamp / kMicrosInSecond; caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1; + // Populate the feature vectors. + access_sequence_number_timeline.push_back(access_sequnce_number); + caller_access_sequence__number_timeline[access.caller].push_back( + access_sequnce_number); + access_timeline.push_back(access.access_timestamp); + caller_access_timeline[access.caller].push_back(access.access_timestamp); if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type, access.caller)) { num_keys = access.num_keys_in_block; @@ -94,11 +124,23 @@ struct ColumnFamilyAccessInfoAggregate { std::map fd_aggregates_map; }; +struct Features { + std::vector elapsed_time_since_last_access; + std::vector num_accesses_since_last_access; + std::vector num_past_accesses; +}; + +struct Predictions { + std::vector elapsed_time_till_next_access; + std::vector num_accesses_till_next_access; +}; + class BlockCacheTraceAnalyzer { public: BlockCacheTraceAnalyzer( const std::string& trace_file_path, const std::string& output_dir, - bool compute_reuse_distance, + const std::string& human_readable_trace_file_path, + bool compute_reuse_distance, bool mrc_only, std::unique_ptr&& cache_simulator); ~BlockCacheTraceAnalyzer() = default; // No copy and move. @@ -184,6 +226,24 @@ class BlockCacheTraceAnalyzer { // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses". void WriteMissRatioCurves() const; + // Write miss ratio timeline of simulated cache configurations into several + // csv files, one per cache capacity saved in 'output_dir'. + // + // The file format is + // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second" + // where N is the number of unique cache names + // (cache_name+num_shard_bits+ghost_capacity). + void WriteMissRatioTimeline(uint64_t time_unit) const; + + // Write misses timeline of simulated cache configurations into several + // csv files, one per cache capacity saved in 'output_dir'. + // + // The file format is + // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second" + // where N is the number of unique cache names + // (cache_name+num_shard_bits+ghost_capacity). 
+ void WriteMissTimeline(uint64_t time_unit) const; + // Write the access timeline into a csv file saved in 'output_dir'. // // The file is named "label_access_timeline".The file format is @@ -236,6 +296,11 @@ class BlockCacheTraceAnalyzer { const std::string& label_str, const std::vector& percent_buckets) const; + void WriteCorrelationFeatures(const std::string& label_str, + uint32_t max_number_of_values) const; + + void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const; + const std::map& TEST_cf_aggregates_map() const { return cf_aggregates_map_; @@ -251,7 +316,7 @@ class BlockCacheTraceAnalyzer { void ComputeReuseDistance(BlockAccessInfo* info) const; - void RecordAccess(const BlockCacheTraceRecord& access); + Status RecordAccess(const BlockCacheTraceRecord& access); void UpdateReuseIntervalStats( const std::string& label, const std::vector& time_buckets, @@ -278,17 +343,41 @@ class BlockCacheTraceAnalyzer { const BlockAccessInfo& /*block_access_info*/)> block_callback) const; + void UpdateFeatureVectors( + const std::vector& access_sequence_number_timeline, + const std::vector& access_timeline, const std::string& label, + std::map* label_features, + std::map* label_predictions) const; + + void WriteCorrelationFeaturesToFile( + const std::string& label, + const std::map& label_features, + const std::map& label_predictions, + uint32_t max_number_of_values) const; + + Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access, + uint64_t block_id, uint64_t get_key_id); + rocksdb::Env* env_; const std::string trace_file_path_; const std::string output_dir_; + std::string human_readable_trace_file_path_; const bool compute_reuse_distance_; + const bool mrc_only_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; std::map block_info_map_; + std::unordered_map get_key_info_map_; + uint64_t access_sequence_number_ = 0; uint64_t trace_start_timestamp_in_seconds_ = 0; uint64_t trace_end_timestamp_in_seconds_ = 0; + MissRatioStats miss_ratio_stats_; + uint64_t unique_block_id_ = 1; + uint64_t unique_get_key_id_ = 1; + char trace_record_buffer_[1024 * 1024]; + std::unique_ptr human_readable_trace_file_writer_; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index 45ef99eee75..a028bf197c9 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -117,7 +117,8 @@ class BlockCacheTracerTest : public testing::Test { // Provide these fields for all block types. // The writer should only write these fields for data blocks and the // caller is either GET or MGET. 
- record.referenced_key = kRefKeyPrefix + std::to_string(key_id); + record.referenced_key = + kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0); record.referenced_key_exist_in_block = Boolean::kTrue; record.num_keys_in_block = kNumKeysInBlock; ASSERT_OK(writer->WriteBlockAccess( @@ -179,7 +180,8 @@ class BlockCacheTracerTest : public testing::Test { "-analyze_get_spatial_locality_labels=" + analyze_get_spatial_locality_labels_, "-analyze_get_spatial_locality_buckets=" + - analyze_get_spatial_locality_buckets_}; + analyze_get_spatial_locality_buckets_, + "-analyze_correlation_coefficients_labels=all"}; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -236,9 +238,9 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { RunBlockCacheTraceAnalyzer(); { // Validate the cache miss ratios. - const std::vector expected_capacities{1024, 1024 * 1024, - 1024 * 1024 * 1024}; - const std::string mrc_path = test_path_ + "/mrc"; + std::vector expected_capacities{1024, 1024 * 1024, + 1024 * 1024 * 1024}; + const std::string mrc_path = test_path_ + "/49_50_mrc"; std::ifstream infile(mrc_path); uint32_t config_index = 0; std::string line; @@ -266,6 +268,68 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { ASSERT_EQ(expected_capacities.size(), config_index); infile.close(); ASSERT_OK(env_->DeleteFile(mrc_path)); + + const std::vector time_units{"1", "60", "3600"}; + expected_capacities.push_back(port::kMaxUint64); + for (auto const& expected_capacity : expected_capacities) { + for (auto const& time_unit : time_units) { + const std::string miss_ratio_timeline_path = + test_path_ + "/" + std::to_string(expected_capacity) + "_" + + time_unit + "_miss_ratio_timeline"; + std::ifstream mrt_file(miss_ratio_timeline_path); + // Read header. + ASSERT_TRUE(getline(mrt_file, line)); + ASSERT_TRUE(getline(mrt_file, line)); + std::stringstream ss(line); + bool read_header = false; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + if (!read_header) { + if (expected_capacity == port::kMaxUint64) { + ASSERT_EQ("trace", substr); + } else { + ASSERT_EQ("lru-1-0", substr); + } + read_header = true; + continue; + } + ASSERT_DOUBLE_EQ(100.0, ParseDouble(substr)); + } + ASSERT_FALSE(getline(mrt_file, line)); + mrt_file.close(); + ASSERT_OK(env_->DeleteFile(miss_ratio_timeline_path)); + } + for (auto const& time_unit : time_units) { + const std::string miss_timeline_path = + test_path_ + "/" + std::to_string(expected_capacity) + "_" + + time_unit + "_miss_timeline"; + std::ifstream mt_file(miss_timeline_path); + // Read header. + ASSERT_TRUE(getline(mt_file, line)); + ASSERT_TRUE(getline(mt_file, line)); + std::stringstream ss(line); + uint32_t num_misses = 0; + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + if (num_misses == 0) { + if (expected_capacity == port::kMaxUint64) { + ASSERT_EQ("trace", substr); + } else { + ASSERT_EQ("lru-1-0", substr); + } + num_misses++; + continue; + } + num_misses += ParseInt(substr); + } + ASSERT_EQ(51, num_misses); + ASSERT_FALSE(getline(mt_file, line)); + mt_file.close(); + ASSERT_OK(env_->DeleteFile(miss_timeline_path)); + } + } } { // Validate the timeline csv files. @@ -543,7 +607,9 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { // Read blocks. 
BlockCacheTraceAnalyzer analyzer(trace_file_path_, /*output_miss_ratio_curve_path=*/"", + /*human_readable_trace_file_path=*/"", /*compute_reuse_distance=*/true, + /*mrc_only=*/false, /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 4f320ef2d0f..1eeb64ac85d 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -6,6 +6,7 @@ #include "trace_replay/block_cache_tracer.h" #include "db/db_impl/db_impl.h" +#include "db/dbformat.h" #include "rocksdb/slice.h" #include "util/coding.h" #include "util/hash.h" @@ -54,6 +55,19 @@ bool BlockCacheTraceHelper::IsUserAccess(TableReaderCaller caller) { caller == TableReaderCaller::kUserVerifyChecksum; } +std::string BlockCacheTraceHelper::ComputeRowKey( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller)) { + return ""; + } + Slice key = ExtractUserKey(access.referenced_key); + uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse + ? 0 + : 1 + GetInternalKeySeqno(access.referenced_key); + return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + + std::to_string(seq_no); +} + BlockCacheTraceWriter::BlockCacheTraceWriter( Env* env, const TraceOptions& trace_options, std::unique_ptr&& trace_writer) diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index b1a258843e5..3863ca430a4 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -20,6 +20,7 @@ extern const uint64_t kMicrosInSecond; extern const uint64_t kSecondInMinute; extern const uint64_t kSecondInHour; +struct BlockCacheTraceRecord; class BlockCacheTraceHelper { public: @@ -27,7 +28,9 @@ class BlockCacheTraceHelper { TableReaderCaller caller); static bool IsGetOrMultiGet(TableReaderCaller caller); static bool IsUserAccess(TableReaderCaller caller); - + // Row key is a concatenation of the access's fd_number and the referenced + // user key. + static std::string ComputeRowKey(const BlockCacheTraceRecord& access); static const std::string kUnknownColumnFamilyName; static const uint64_t kReservedGetId; }; diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 90433df11bf..06de4c11996 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -4,13 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "utilities/simulator_cache/cache_simulator.h" +#include #include "db/dbformat.h" namespace rocksdb { namespace { const std::string kGhostCachePrefix = "ghost_"; -} +} // namespace GhostCache::GhostCache(std::shared_ptr sim_cache) : sim_cache_(sim_cache) {} @@ -22,7 +23,7 @@ bool GhostCache::Admit(const Slice& lookup_key) { return true; } sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(), - /*deleter=*/nullptr, /*handle=*/nullptr); + /*deleter=*/nullptr); return false; } @@ -43,18 +44,27 @@ void CacheSimulator::Access(const BlockCacheTraceRecord& access) { sim_cache_->Release(handle); is_cache_miss = false; } else { - if (access.no_insert == Boolean::kFalse && admit) { + if (access.no_insert == Boolean::kFalse && admit && access.block_size > 0) { sim_cache_->Insert(access.block_key, /*value=*/nullptr, access.block_size, - /*deleter=*/nullptr, /*handle=*/nullptr); + /*deleter=*/nullptr); } } - UpdateMetrics(is_user_access, is_cache_miss); + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + is_cache_miss); } -void CacheSimulator::UpdateMetrics(bool is_user_access, bool is_cache_miss) { +void MissRatioStats::UpdateMetrics(uint64_t timestamp_in_ms, + bool is_user_access, bool is_cache_miss) { + uint64_t timestamp_in_seconds = timestamp_in_ms / kMicrosInSecond; + num_accesses_timeline_[timestamp_in_seconds] += 1; num_accesses_ += 1; + if (num_misses_timeline_.find(timestamp_in_seconds) == + num_misses_timeline_.end()) { + num_misses_timeline_[timestamp_in_seconds] = 0; + } if (is_cache_miss) { num_misses_ += 1; + num_misses_timeline_[timestamp_in_seconds] += 1; } if (is_user_access) { user_accesses_ += 1; @@ -76,8 +86,8 @@ Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority( void PrioritizedCacheSimulator::AccessKVPair( const Slice& key, uint64_t value_size, Cache::Priority priority, - bool no_insert, bool is_user_access, bool* is_cache_miss, bool* admitted, - bool update_metrics) { + const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access, + bool* is_cache_miss, bool* admitted, bool update_metrics) { assert(is_cache_miss); assert(admitted); *is_cache_miss = true; @@ -90,11 +100,12 @@ void PrioritizedCacheSimulator::AccessKVPair( sim_cache_->Release(handle); *is_cache_miss = false; } else if (!no_insert && *admitted && value_size > 0) { - sim_cache_->Insert(key, /*value=*/nullptr, value_size, - /*deleter=*/nullptr, /*handle=*/nullptr, priority); + sim_cache_->Insert(key, /*value=*/nullptr, value_size, /*deleter=*/nullptr, + /*handle=*/nullptr, priority); } if (update_metrics) { - UpdateMetrics(is_user_access, *is_cache_miss); + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access, + *is_cache_miss); } } @@ -102,38 +113,28 @@ void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) { bool is_cache_miss = true; bool admitted = true; AccessKVPair(access.block_key, access.block_size, - ComputeBlockPriority(access), access.no_insert, + ComputeBlockPriority(access), access, access.no_insert, BlockCacheTraceHelper::IsUserAccess(access.caller), &is_cache_miss, &admitted, /*update_metrics=*/true); } -std::string HybridRowBlockCacheSimulator::ComputeRowKey( - const BlockCacheTraceRecord& access) { - assert(access.get_id != BlockCacheTraceHelper::kReservedGetId); - Slice key = ExtractUserKey(access.referenced_key); - uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse - ? 
0 - : 1 + GetInternalKeySeqno(access.referenced_key); - return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + - std::to_string(seq_no); -} - void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { - bool is_cache_miss = true; - bool admitted = true; // TODO (haoyu): We only support Get for now. We need to extend the tracing // for MultiGet, i.e., non-data block accesses must log all keys in a // MultiGet. + bool is_cache_miss = false; + bool admitted = false; if (access.caller == TableReaderCaller::kUserGet && access.get_id != BlockCacheTraceHelper::kReservedGetId) { // This is a Get/MultiGet request. - const std::string& row_key = ComputeRowKey(access); + const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access); if (getid_getkeys_map_[access.get_id].find(row_key) == getid_getkeys_map_[access.get_id].end()) { // This is the first time that this key is accessed. Look up the key-value // pair first. Do not update the miss/accesses metrics here since it will // be updated later. AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH, + access, /*no_insert=*/false, /*is_user_access=*/true, &is_cache_miss, &admitted, /*update_metrics=*/false); @@ -154,28 +155,31 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { // referenced key-value pair already. Thus, we treat these lookups as // hits. This is also to ensure the total number of accesses are the same // when comparing to other policies. - UpdateMetrics(/*is_user_access=*/true, /*is_cache_miss=*/false); + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); return; } // The key-value pair observes a cache miss. We need to access its // index/filter/data blocks. 
AccessKVPair( access.block_key, access.block_type, ComputeBlockPriority(access), + access, /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, /*is_user_access=*/true, &is_cache_miss, &admitted, /*update_metrics=*/true); if (access.referenced_data_size > 0 && miss_inserted.second == InsertResult::ADMITTED) { - sim_cache_->Insert( - row_key, /*value=*/nullptr, access.referenced_data_size, - /*deleter=*/nullptr, /*handle=*/nullptr, Cache::Priority::HIGH); + sim_cache_->Insert(row_key, /*value=*/nullptr, + access.referenced_data_size, /*deleter=*/nullptr, + /*handle=*/nullptr, Cache::Priority::HIGH); getid_getkeys_map_[access.get_id][row_key] = std::make_pair(true, InsertResult::INSERTED); } return; } AccessKVPair(access.block_key, access.block_size, - ComputeBlockPriority(access), access.no_insert, + ComputeBlockPriority(access), access, access.no_insert, BlockCacheTraceHelper::IsUserAccess(access.caller), &is_cache_miss, &admitted, /*update_metrics=*/true); } diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index 82972688658..3863fcf88dd 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -5,6 +5,9 @@ #pragma once +#include + +#include "cache/lru_cache.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -29,6 +32,51 @@ struct CacheConfiguration { } }; +class MissRatioStats { + public: + void reset_counter() { + num_misses_ = 0; + num_accesses_ = 0; + user_accesses_ = 0; + user_misses_ = 0; + } + double miss_ratio() const { + if (num_accesses_ == 0) { + return -1; + } + return static_cast(num_misses_ * 100.0 / num_accesses_); + } + uint64_t total_accesses() const { return num_accesses_; } + + const std::map& num_accesses_timeline() const { + return num_accesses_timeline_; + } + + const std::map& num_misses_timeline() const { + return num_misses_timeline_; + } + + double user_miss_ratio() const { + if (user_accesses_ == 0) { + return -1; + } + return static_cast(user_misses_ * 100.0 / user_accesses_); + } + uint64_t user_accesses() const { return user_accesses_; } + + void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access, + bool is_cache_miss); + + private: + uint64_t num_accesses_ = 0; + uint64_t num_misses_ = 0; + uint64_t user_accesses_ = 0; + uint64_t user_misses_ = 0; + + std::map num_accesses_timeline_; + std::map num_misses_timeline_; +}; + // A ghost cache admits an entry on its second access. 
class GhostCache { public: @@ -61,37 +109,15 @@ class CacheSimulator { CacheSimulator& operator=(CacheSimulator&&) = delete; virtual void Access(const BlockCacheTraceRecord& access); - void reset_counter() { - num_misses_ = 0; - num_accesses_ = 0; - user_accesses_ = 0; - user_misses_ = 0; - } - double miss_ratio() const { - if (num_accesses_ == 0) { - return -1; - } - return static_cast(num_misses_ * 100.0 / num_accesses_); - } - uint64_t total_accesses() const { return num_accesses_; } - double user_miss_ratio() const { - if (user_accesses_ == 0) { - return -1; - } - return static_cast(user_misses_ * 100.0 / user_accesses_); - } - uint64_t user_accesses() const { return user_accesses_; } + void reset_counter() { miss_ratio_stats_.reset_counter(); } - protected: - void UpdateMetrics(bool is_user_access, bool is_cache_miss); + const MissRatioStats& miss_ratio_stats() const { return miss_ratio_stats_; } + protected: + MissRatioStats miss_ratio_stats_; std::unique_ptr ghost_cache_; std::shared_ptr sim_cache_; - uint64_t num_accesses_ = 0; - uint64_t num_misses_ = 0; - uint64_t user_accesses_ = 0; - uint64_t user_misses_ = 0; }; // A prioritized cache simulator that runs against a block cache trace. @@ -107,7 +133,8 @@ class PrioritizedCacheSimulator : public CacheSimulator { protected: // Access the key-value pair and returns true upon a cache miss. void AccessKVPair(const Slice& key, uint64_t value_size, - Cache::Priority priority, bool no_insert, + Cache::Priority priority, + const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access, bool* is_cache_miss, bool* admitted, bool update_metrics); @@ -135,10 +162,6 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { void Access(const BlockCacheTraceRecord& access) override; private: - // Row key is a concatenation of the access's fd_number and the referenced - // user key. 
- std::string ComputeRowKey(const BlockCacheTraceRecord& access); - enum InsertResult : char { INSERTED, ADMITTED, diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index f435785e6a1..dc3b8327e01 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -94,21 +94,21 @@ TEST_F(CacheSimulatorTest, CacheSimulator) { new CacheSimulator(nullptr, sim_cache)); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); - ASSERT_EQ(50, cache_simulator->miss_ratio()); - ASSERT_EQ(2, cache_simulator->user_accesses()); - ASSERT_EQ(50, cache_simulator->user_miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio()); cache_simulator->Access(compaction_access); cache_simulator->Access(compaction_access); - ASSERT_EQ(4, cache_simulator->total_accesses()); - ASSERT_EQ(75, cache_simulator->miss_ratio()); - ASSERT_EQ(2, cache_simulator->user_accesses()); - ASSERT_EQ(50, cache_simulator->user_miss_ratio()); + ASSERT_EQ(4, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(75, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio()); cache_simulator->reset_counter(); - ASSERT_EQ(0, cache_simulator->total_accesses()); - ASSERT_EQ(-1, cache_simulator->miss_ratio()); + ASSERT_EQ(0, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(-1, cache_simulator->miss_ratio_stats().miss_ratio()); auto handle = sim_cache->Lookup(access.block_key); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); @@ -129,9 +129,9 @@ TEST_F(CacheSimulatorTest, GhostCacheSimulator) { /*high_pri_pool_ratio=*/0))); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); // Both of them will be miss since we have a ghost cache. - ASSERT_EQ(100, cache_simulator->miss_ratio()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); } TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { @@ -144,8 +144,8 @@ TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) { new PrioritizedCacheSimulator(nullptr, sim_cache)); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); - ASSERT_EQ(50, cache_simulator->miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio()); auto handle = sim_cache->Lookup(access.block_key); ASSERT_NE(nullptr, handle); @@ -166,9 +166,9 @@ TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) { /*high_pri_pool_ratio=*/0))); cache_simulator->Access(access); cache_simulator->Access(access); - ASSERT_EQ(2, cache_simulator->total_accesses()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); // Both of them will be miss since we have a ghost cache. 
- ASSERT_EQ(100, cache_simulator->miss_ratio()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); } TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { @@ -200,10 +200,11 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { cache_simulator->Access(first_get); block_id++; } - ASSERT_EQ(10, cache_simulator->total_accesses()); - ASSERT_EQ(100, cache_simulator->miss_ratio()); - ASSERT_EQ(10, cache_simulator->user_accesses()); - ASSERT_EQ(100, cache_simulator->user_miss_ratio()); + + ASSERT_EQ(10, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); auto handle = sim_cache->Lookup( std::to_string(first_get.sst_fd_number) + "_" + ExtractUserKey(first_get.referenced_key).ToString() + "_" + @@ -225,10 +226,12 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { cache_simulator->Access(second_get); block_id++; } - ASSERT_EQ(15, cache_simulator->total_accesses()); - ASSERT_EQ(66, static_cast(cache_simulator->miss_ratio())); - ASSERT_EQ(15, cache_simulator->user_accesses()); - ASSERT_EQ(66, static_cast(cache_simulator->user_miss_ratio())); + ASSERT_EQ(15, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(66, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(66, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); handle = sim_cache->Lookup( std::to_string(second_get.sst_fd_number) + "_" + ExtractUserKey(second_get.referenced_key).ToString() + "_" + @@ -252,10 +255,12 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { cache_simulator->Access(third_get); block_id++; } - ASSERT_EQ(20, cache_simulator->total_accesses()); - ASSERT_EQ(75, static_cast(cache_simulator->miss_ratio())); - ASSERT_EQ(20, cache_simulator->user_accesses()); - ASSERT_EQ(75, static_cast(cache_simulator->user_miss_ratio())); + ASSERT_EQ(20, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(75, static_cast( + cache_simulator->miss_ratio_stats().miss_ratio())); + ASSERT_EQ(20, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(75, static_cast( + cache_simulator->miss_ratio_stats().user_miss_ratio())); // Assert that the third key is not inserted into the cache. handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" + third_get.referenced_key); @@ -318,19 +323,21 @@ TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) { // Two get requests access the same key. cache_simulator->Access(first_get); cache_simulator->Access(second_get); - ASSERT_EQ(2, cache_simulator->total_accesses()); - ASSERT_EQ(100, cache_simulator->miss_ratio()); - ASSERT_EQ(2, cache_simulator->user_accesses()); - ASSERT_EQ(100, cache_simulator->user_miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); + ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses()); + ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); // We insert the key-value pair upon the second get request. A third get // request should observe a hit. 
for (uint32_t i = 0; i < 10; i++) {
    cache_simulator->Access(third_get);
  }
-  ASSERT_EQ(12, cache_simulator->total_accesses());
-  ASSERT_EQ(16, static_cast<uint64_t>(cache_simulator->miss_ratio()));
-  ASSERT_EQ(12, cache_simulator->user_accesses());
-  ASSERT_EQ(16, static_cast<uint64_t>(cache_simulator->user_miss_ratio()));
+  ASSERT_EQ(12, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(16, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().miss_ratio()));
+  ASSERT_EQ(12, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(16, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().user_miss_ratio()));
 }
 }  // namespace rocksdb

From 66b5613d0c3f84e5ef72c43b62a2e9866efdde8a Mon Sep 17 00:00:00 2001
From: sdong
Date: Mon, 22 Jul 2019 18:53:03 -0700
Subject: [PATCH 242/572] row_cache to share entry for recent snapshots (#5600)

Summary:
Right now, users cannot take advantage of the row cache unless no snapshot is used, or Get() is repeated for the same snapshots. This limits the usage of the row cache.
This change eliminates this restriction in some cases. If the snapshot used is newer than the largest sequence number in the file, and no write callback function is registered, the same row cache key is used as if no snapshot were given (see the sketch after the HISTORY.md hunk below). We still need the callback function restriction for now because the callback function may filter out different keys for different snapshots even if the snapshots are new.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5600

Test Plan: Add a unit test.

Differential Revision: D16386616

fbshipit-source-id: 6b7d214bd215d191b03ccf55926ad4b703ec2e53
---
 HISTORY.md | 1 +
 db/db_test2.cc | 46 +++++++++++++++++++++++++++++++++++++++++++++
 db/table_cache.cc | 21 +++++++++++++++++++--
 table/get_context.h | 2 ++
 4 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b9d0f741317..efd49f642b0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -33,6 +33,7 @@
 * Reduce iterator key comparison for upper/lower bound check.
 * Log Writer will flush after finishing the whole record, rather than a fragment.
 * Lower MultiGet batching API latency by reading data blocks from disk in parallel
+* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases.

 ### General Improvements
 * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.
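[Editor's note] The row-cache key rule described in the #5600 summary above can be condensed into a small predicate. The sketch below is illustrative only: the helper name RowCacheSeqnoSuffix and its flattened parameters are hypothetical, and the real logic lives inline in TableCache::Get (see the table_cache.cc hunk that follows).

// Hypothetical condensation of the row-cache key rule from #5600.
#include <cstdint>

// Returns the sequence-number suffix appended to the row cache key.
uint64_t RowCacheSeqnoSuffix(bool has_snapshot, bool has_seq_callback,
                             uint64_t snapshot_seqno, uint64_t lookup_key_seqno,
                             uint64_t file_largest_seqno) {
  if (!has_snapshot) {
    // No snapshot: all readers share a single entry per key.
    return 0;
  }
  if (!has_seq_callback && snapshot_seqno > file_largest_seqno) {
    // The snapshot is newer than everything in this file, so the read can
    // share the same cache entry as a snapshot-less read.
    return 0;
  }
  // Otherwise the entry must be snapshot-specific; the +1 distinguishes a
  // real sequence number 0 from the "no snapshot" tag above.
  return 1 + lookup_key_seqno;
}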
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 109a7a377bf..3664b3a249f 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -3771,6 +3771,52 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
   delete db_;
   db_ = nullptr;
 }
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
 }  // namespace rocksdb

 int main(int argc, char** argv) {
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 121d4941fc0..2290b5939c5 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -11,6 +11,7 @@
 #include "db/dbformat.h"
 #include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
 #include "db/version_edit.h"
 #include "file/filename.h"
@@ -24,6 +25,7 @@
 #include "table/table_builder.h"
 #include "table/table_reader.h"
 #include "test_util/sync_point.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
@@ -277,8 +279,23 @@ Status TableCache::Get(const ReadOptions& options,
       // sequence key increases. However, to support caching snapshot
       // reads, we append the sequence number (incremented by 1 to
       // distinguish from 0) only in this case.
-      uint64_t seq_no =
-          options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+      // If the snapshot is larger than the largest seqno in the file,
+      // all data should be exposed to the snapshot, so we treat it
+      // the same as if there is no snapshot. The exception is that if
+      // a seq-checking callback is registered, some internal keys
+      // may still be filtered out.
+      uint64_t seq_no = 0;
+      // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+      if (options.snapshot != nullptr &&
+          (get_context->has_callback() ||
+           static_cast_with_check<const SnapshotImpl, const Snapshot>(
+               options.snapshot)
+                   ->GetSequenceNumber() <= fd.largest_seqno)) {
+        // We should consider using options.snapshot->GetSequenceNumber()
+        // instead of GetInternalKeySeqno(k), which would make the code
+        // easier to understand.
+ seq_no = 1 + GetInternalKeySeqno(k); + } // Compute row cache key. row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), diff --git a/table/get_context.h b/table/get_context.h index 7a37beb2df2..7110ceae806 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -136,6 +136,8 @@ class GetContext { void ReportCounters(); + bool has_callback() const { return callback_ != nullptr; } + uint64_t get_tracing_get_id() const { return tracing_get_id_; } private: From 327c4807a7fe8532326323e2753670daf06a0f6b Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Mon, 22 Jul 2019 20:01:25 -0700 Subject: [PATCH 243/572] Disable refresh snapshot feature by default (#5606) Summary: There are concerns about the correctness of this patch. Disabling by default until the concerns are resolved. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5606 Differential Revision: D16428064 Pulled By: maysamyabandeh fbshipit-source-id: a89280f0ea85796c9c9dfbfd9a8e91dad9b000b3 --- HISTORY.md | 2 +- include/rocksdb/options.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index efd49f642b0..19f4ce1297c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,7 +20,7 @@ * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 ### New Features -* Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. +* Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 234af6a31eb..35c27556553 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -275,10 +275,10 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // this option helps reducing the cpu usage of long-running compactions. The // feature is disabled when max_subcompactions is greater than one. // - // Default: 0.1s + // Default: 0 // // Dynamically changeable through SetOptions() API - uint64_t snap_refresh_nanos = 100 * 1000 * 1000; // 0.1s + uint64_t snap_refresh_nanos = 0; // Disable automatic compactions. Manual compactions can still // be issued on this column family From eae832740b16f9c2fbe2225f9c3eef0c0a1e1f48 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Tue, 23 Jul 2019 08:04:58 -0700 Subject: [PATCH 244/572] WriteUnPrepared: improve read your own write functionality (#5573) Summary: There are a number of fixes in this PR (with most bugs found via the added stress tests): 1. Re-enable reseek optimization. 
This was initially disabled to avoid infinite loops in https://github.com/facebook/rocksdb/pull/3955, but this can be resolved by remembering not to reseek after a reseek has already been done. This problem only affects forward iteration in `DBIter::FindNextUserEntryInternal`, as we already disable reseeking in `DBIter::FindValueForCurrentKeyUsingSeek`.
2. Verify that ReadOptions.snapshot can be safely used for iterator creation. Some snapshots would not give correct results because snapshot validation would not be enforced, breaking some assumptions in Prev() iteration.
3. In the non-snapshot Get() case, reads done at `LastPublishedSequence` may not be enough, because unprepared sequence numbers are not published. Use `std::max(published_seq, max_visible_seq)` to do lookups instead.
4. Add a stress test to test reading your own writes.
5. Fix a minor bug in the allow_concurrent_memtable_write case where we forgot to pass in batch_per_txn_.
6. Minor performance optimization in `CalcMaxUnpreparedSequenceNumber` by assigning by reference instead of by value.
7. Add some more comments everywhere.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5573

Differential Revision: D16276089

Pulled By: lth

fbshipit-source-id: 18029c944eb427a90a87dee76ac1b23f37ec1ccb
---
 db/db_impl/db_impl.cc | 15 +
 db/db_impl/db_impl_write.cc | 3 +-
 db/db_iter.cc | 33 +-
 db/read_callback.h | 3 -
 utilities/transactions/transaction_test.cc | 16 +
 .../write_unprepared_transaction_test.cc | 345 +++++++++++-------
 .../transactions/write_unprepared_txn.cc | 80 ++--
 utilities/transactions/write_unprepared_txn.h | 57 ++-
 .../transactions/write_unprepared_txn_db.cc | 82 ++++-
 9 files changed, 436 insertions(+), 198 deletions(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index af9aea011a3..8132d5a0b38 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1499,7 +1499,22 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
                  ? versions_->LastSequence()
                  : versions_->LastPublishedSequence();
   if (callback) {
+    // The unprep_seqs are not published for write unprepared, so it could be
+    // that max_visible_seq is larger. Seek to the std::max of the two.
+    // However, we still want our callback to contain the actual snapshot so
+    // that it can do the correct visibility filtering.
     callback->Refresh(snapshot);
+
+    // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+    // max_visible_seq = max(max_visible_seq, snapshot)
+    //
+    // Currently, the commented out assert is broken by
+    // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+    // the regular transaction flow, then this special read callback would not
+    // be needed.
+ // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = callback->max_visible_seq(); } } TEST_SYNC_POINT("DBImpl::GetImpl:3"); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index c0d320013b7..95a1b31c769 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -172,7 +172,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt); + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt, + batch_per_txn_); PERF_TIMER_START(write_pre_and_post_process_time); } diff --git a/db/db_iter.cc b/db/db_iter.cc index 633724c5763..060138fd64b 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -263,12 +263,6 @@ class DBIter final: public Iterator { bool TooManyInternalKeysSkipped(bool increment = true); inline bool IsVisible(SequenceNumber sequence); - // CanReseekToSkip() returns whether the iterator can use the optimization - // where it reseek by sequence number to get the next key when there are too - // many versions. This is disabled for write unprepared because seeking to - // sequence number does not guarantee that it is visible. - inline bool CanReseekToSkip(); - // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() // is called void TempPinData() { @@ -453,6 +447,11 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // greater than that, // - none of the above : saved_key_ can contain anything, it doesn't matter. uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; is_blob_ = false; @@ -498,6 +497,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) assert(!skipping || user_comparator_.Compare( ikey_.user_key, saved_key_.GetUserKey()) > 0); num_skipped = 0; + reseek_done = false; switch (ikey_.type) { case kTypeDeletion: case kTypeSingleDeletion: @@ -551,6 +551,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // they are hidden by this deletion. skipping = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else if (ikey_.type == kTypeBlobIndex) { if (!allow_blob_) { @@ -581,6 +582,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // they are hidden by this deletion. skipping = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else { // By now, we are sure the current ikey is going to yield a @@ -611,14 +613,23 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); skipping = false; num_skipped = 0; + reseek_done = false; } } // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. - if (num_skipped > max_skip_ && CanReseekToSkip()) { + // + // To avoid infinite loops, do not reseek if we have already attempted to + // reseek previously. 
+  //
+  // TODO(lth): If we reseek to a sequence number greater than ikey_.sequence,
+  // then it does not make sense to reseek as we would actually land further
+  // away from the desired key. There is opportunity for optimization here.
+  if (num_skipped > max_skip_ && !reseek_done) {
     is_key_seqnum_zero_ = false;
     num_skipped = 0;
+    reseek_done = true;
     std::string last_key;
     if (skipping) {
       // We're looking for the next user-key but all we see are the same
@@ -937,7 +948,7 @@ bool DBIter::FindValueForCurrentKey() {
   // This user key has lots of entries.
   // We're going from old to new, and it's taking too long. Let's do a Seek()
   // and go from new to old. This helps when a key was overwritten many times.
-  if (num_skipped >= max_skip_ && CanReseekToSkip()) {
+  if (num_skipped >= max_skip_) {
     return FindValueForCurrentKeyUsingSeek();
   }
@@ -1234,7 +1245,7 @@ bool DBIter::FindUserKeyBeforeSavedKey() {
       PERF_COUNTER_ADD(internal_key_skipped_count, 1);
     }
-    if (num_skipped >= max_skip_ && CanReseekToSkip()) {
+    if (num_skipped >= max_skip_) {
       num_skipped = 0;
       IterKey last_key;
       last_key.SetInternalKey(ParsedInternalKey(
@@ -1281,10 +1292,6 @@ bool DBIter::IsVisible(SequenceNumber sequence) {
   }
 }

-bool DBIter::CanReseekToSkip() {
-  return read_callback_ == nullptr || read_callback_->CanReseekToSkip();
-}
-
 void DBIter::Seek(const Slice& target) {
   PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
   StopWatch sw(env_, statistics_, DB_SEEK);
diff --git a/db/read_callback.h b/db/read_callback.h
index 60f91ef872d..d8801e65173 100644
--- a/db/read_callback.h
+++ b/db/read_callback.h
@@ -42,9 +42,6 @@ class ReadCallback {
   // Refresh to a more recent visible seq
   virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }

-  // Refer to DBIter::CanReseekToSkip
-  virtual bool CanReseekToSkip() { return true; }
-
  protected:
   // The max visible seq, it is usually the snapshot but could be larger if
   // transaction has its own writes written to db.
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index a410c5b5196..7868d0060e9 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -3471,6 +3471,12 @@ TEST_P(TransactionTest, LockLimitTest) {
 }

 TEST_P(TransactionTest, IteratorTest) {
+  // This test does writes without snapshot validation, and then tries to
+  // create an iterator later, which is unsupported in write unprepared.
+  if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+    return;
+  }
+
   WriteOptions write_options;
   ReadOptions read_options, snapshot_read_options;
   std::string value;
@@ -3589,6 +3595,16 @@ TEST_P(TransactionTest, IteratorTest) {
 }

 TEST_P(TransactionTest, DisableIndexingTest) {
+  // Skip this test for write unprepared. It does not solely rely on WBWI for
+  // read your own writes, so depending on whether batches are flushed or not,
+  // only some writes will be visible.
+  //
+  // Also, write unprepared does not support creating iterators if there has
+  // been txn->Put() without snapshot validation.
+ if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + WriteOptions write_options; ReadOptions read_options; std::string value; diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index faa6c774578..a2546229e4d 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -37,6 +37,9 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_UNPREPARED))); TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { + // The following tests checks whether reading your own write for + // a transaction works for write unprepared, when there are uncommitted + // values written into DB. auto verify_state = [](Iterator* iter, const std::string& key, const std::string& value) { ASSERT_TRUE(iter->Valid()); @@ -45,155 +48,251 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { ASSERT_EQ(value, iter->value().ToString()); }; - options.disable_auto_compactions = true; - ReOpen(); - - // The following tests checks whether reading your own write for - // a transaction works for write unprepared, when there are uncommitted - // values written into DB. - // - // Although the values written by DB::Put are technically committed, we add - // their seq num to unprep_seqs_ to pretend that they were written into DB - // as part of an unprepared batch, and then check if they are visible to the - // transaction. - auto snapshot0 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v1")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v2")); - auto snapshot2 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v3")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v4")); - auto snapshot4 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v5")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v6")); - auto snapshot6 = db->GetSnapshot(); - ASSERT_OK(db->Put(WriteOptions(), "a", "v7")); - ASSERT_OK(db->Put(WriteOptions(), "b", "v8")); - auto snapshot8 = db->GetSnapshot(); - - TransactionOptions txn_options; - WriteOptions write_options; - Transaction* txn = db->BeginTransaction(write_options, txn_options); - WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); - - ReadOptions roptions; - roptions.snapshot = snapshot0; - - wup_txn->unprep_seqs_[snapshot2->GetSequenceNumber() + 1] = - snapshot4->GetSequenceNumber() - snapshot2->GetSequenceNumber(); - auto iter = txn->GetIterator(roptions); + // Test always reseeking vs never reseeking. + for (uint64_t max_skip : {0, std::numeric_limits::max()}) { + options.max_sequential_skip_in_iterations = max_skip; + options.disable_auto_compactions = true; + ReOpen(); - // Test Get(). 
- std::string value; + TransactionOptions txn_options; + WriteOptions woptions; + ReadOptions roptions; - ASSERT_OK(txn->Get(roptions, Slice("a"), &value)); - ASSERT_EQ(value, "v3"); + ASSERT_OK(db->Put(woptions, "a", "")); + ASSERT_OK(db->Put(woptions, "b", "")); - ASSERT_OK(txn->Get(roptions, Slice("b"), &value)); - ASSERT_EQ(value, "v4"); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + WriteUnpreparedTxn* wup_txn = dynamic_cast(txn); + txn->SetSnapshot(); - wup_txn->unprep_seqs_[snapshot6->GetSequenceNumber() + 1] = - snapshot8->GetSequenceNumber() - snapshot6->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + for (int i = 0; i < 5; i++) { + std::string stored_value = "v" + ToString(i); + ASSERT_OK(txn->Put("a", stored_value)); + ASSERT_OK(txn->Put("b", stored_value)); + wup_txn->FlushWriteBatchToDB(false); - ASSERT_OK(txn->Get(roptions, Slice("a"), &value)); - ASSERT_EQ(value, "v7"); + // Test Get() + std::string value; + ASSERT_OK(txn->Get(roptions, "a", &value)); + ASSERT_EQ(value, stored_value); + ASSERT_OK(txn->Get(roptions, "b", &value)); + ASSERT_EQ(value, stored_value); - ASSERT_OK(txn->Get(roptions, Slice("b"), &value)); - ASSERT_EQ(value, "v8"); + // Test Next() + auto iter = txn->GetIterator(roptions); + iter->Seek("a"); + verify_state(iter, "a", stored_value); - wup_txn->unprep_seqs_.clear(); + iter->Next(); + verify_state(iter, "b", stored_value); - // Test Next(). - wup_txn->unprep_seqs_[snapshot2->GetSequenceNumber() + 1] = - snapshot4->GetSequenceNumber() - snapshot2->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + iter->SeekToFirst(); + verify_state(iter, "a", stored_value); - iter->Seek("a"); - verify_state(iter, "a", "v3"); + iter->Next(); + verify_state(iter, "b", stored_value); - iter->Next(); - verify_state(iter, "b", "v4"); + delete iter; - iter->SeekToFirst(); - verify_state(iter, "a", "v3"); + // Test Prev() + iter = txn->GetIterator(roptions); + iter->SeekForPrev("b"); + verify_state(iter, "b", stored_value); - iter->Next(); - verify_state(iter, "b", "v4"); + iter->Prev(); + verify_state(iter, "a", stored_value); - wup_txn->unprep_seqs_[snapshot6->GetSequenceNumber() + 1] = - snapshot8->GetSequenceNumber() - snapshot6->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + iter->SeekToLast(); + verify_state(iter, "b", stored_value); - iter->Seek("a"); - verify_state(iter, "a", "v7"); + iter->Prev(); + verify_state(iter, "a", stored_value); - iter->Next(); - verify_state(iter, "b", "v8"); - - iter->SeekToFirst(); - verify_state(iter, "a", "v7"); - - iter->Next(); - verify_state(iter, "b", "v8"); - - wup_txn->unprep_seqs_.clear(); - - // Test Prev(). For Prev(), we need to adjust the snapshot to match what is - // possible in WriteUnpreparedTxn. - // - // Because of row locks and ValidateSnapshot, there cannot be any committed - // entries after snapshot, but before the first prepared key. 
- roptions.snapshot = snapshot2; - wup_txn->unprep_seqs_[snapshot2->GetSequenceNumber() + 1] = - snapshot4->GetSequenceNumber() - snapshot2->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + delete iter; + } - iter->SeekForPrev("b"); - verify_state(iter, "b", "v4"); + delete txn; + } +} - iter->Prev(); - verify_state(iter, "a", "v3"); +TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) { + // This is a stress test where different threads are writing random keys, and + // then before committing or aborting the transaction, it validates to see + // that it can read the keys it wrote, and the keys it did not write respect + // the snapshot. To avoid row lock contention (and simply stressing the + // locking system), each thread is mostly only writing to its own set of keys. + const uint32_t kNumIter = 1000; + const uint32_t kNumThreads = 10; + const uint32_t kNumKeys = 5; + + std::default_random_engine rand(static_cast( + std::hash()(std::this_thread::get_id()))); + + enum Action { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT }; + // Test with + // 1. no snapshots set + // 2. snapshot set on ReadOptions + // 3. snapshot set, and refreshing after every write. + for (Action a : {NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT}) { + WriteOptions write_options; + txn_db_options.transaction_lock_timeout = -1; + options.disable_auto_compactions = true; + ReOpen(); + + std::vector keys; + for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) { + keys.push_back("k" + ToString(k)); + } + std::shuffle(keys.begin(), keys.end(), rand); + + // This counter will act as a "sequence number" to help us validate + // visibility logic with snapshots. If we had direct access to the seqno of + // snapshots and key/values, then we should directly compare those instead. + std::atomic counter(0); + + std::function stress_thread = [&](int id) { + size_t tid = std::hash()(std::this_thread::get_id()); + Random64 rnd(static_cast(tid)); + + Transaction* txn; + TransactionOptions txn_options; + // batch_size of 1 causes writes to DB for every marker. + txn_options.max_write_batch_size = 1; + ReadOptions read_options; + + for (uint32_t i = 0; i < kNumIter; i++) { + std::set owned_keys(&keys[id * kNumKeys], + &keys[(id + 1) * kNumKeys]); + // Add unowned keys to make the workload more interesting, but this + // increases row lock contention, so just do it sometimes. + if (rnd.OneIn(2)) { + owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]); + } - iter->SeekToLast(); - verify_state(iter, "b", "v4"); + txn = db->BeginTransaction(write_options, txn_options); + txn->SetName(ToString(id)); + txn->SetSnapshot(); + if (a >= RO_SNAPSHOT) { + read_options.snapshot = txn->GetSnapshot(); + ASSERT_TRUE(read_options.snapshot != nullptr); + } - iter->Prev(); - verify_state(iter, "a", "v3"); + uint64_t buf[2]; + buf[0] = id; - roptions.snapshot = snapshot6; - wup_txn->unprep_seqs_[snapshot6->GetSequenceNumber() + 1] = - snapshot8->GetSequenceNumber() - snapshot6->GetSequenceNumber(); - delete iter; - iter = txn->GetIterator(roptions); + // When scanning through the database, make sure that all unprepared + // keys have value >= snapshot and all other keys have value < snapshot. 
+ int64_t snapshot_num = counter.fetch_add(1); - iter->SeekForPrev("b"); - verify_state(iter, "b", "v8"); + Status s; + for (const auto& key : owned_keys) { + buf[1] = counter.fetch_add(1); + s = txn->Put(key, Slice((const char*)buf, sizeof(buf))); + if (!s.ok()) { + break; + } + if (a == REFRESH_SNAPSHOT) { + txn->SetSnapshot(); + read_options.snapshot = txn->GetSnapshot(); + snapshot_num = counter.fetch_add(1); + } + } - iter->Prev(); - verify_state(iter, "a", "v7"); + // Failure is possible due to snapshot validation. In this case, + // rollback and move onto next iteration. + if (!s.ok()) { + ASSERT_TRUE(s.IsBusy()); + ASSERT_OK(txn->Rollback()); + delete txn; + continue; + } - iter->SeekToLast(); - verify_state(iter, "b", "v8"); + auto verify_key = [&owned_keys, &a, &id, &snapshot_num]( + const std::string& key, + const std::string& value) { + if (owned_keys.count(key) > 0) { + ASSERT_EQ(value.size(), 16); + + // Since this key is part of owned_keys, then this key must be + // unprepared by this transaction identified by 'id' + ASSERT_EQ(((int64_t*)value.c_str())[0], id); + if (a == REFRESH_SNAPSHOT) { + // If refresh snapshot is true, then the snapshot is refreshed + // after every Put(), meaning that the current snapshot in + // snapshot_num must be greater than the "seqno" of any keys + // written by the current transaction. + ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + } else { + // If refresh snapshot is not on, then the snapshot was taken at + // the beginning of the transaction, meaning all writes must come + // after snapshot_num + ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num); + } + } else if (a >= RO_SNAPSHOT) { + // If this is not an unprepared key, just assert that the key + // "seqno" is smaller than the snapshot seqno. + ASSERT_EQ(value.size(), 16); + ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + } + }; + + // Validate Get()/Next()/Prev(). Do only one of them to save time, and + // reduce lock contention. + switch (rnd.Uniform(3)) { + case 0: // Validate Get() + { + for (const auto& key : keys) { + std::string value; + s = txn->Get(read_options, Slice(key), &value); + if (!s.ok()) { + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(owned_keys.count(key), 0); + } else { + verify_key(key, value); + } + } + break; + } + case 1: // Validate Next() + { + Iterator* iter = txn->GetIterator(read_options); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + verify_key(iter->key().ToString(), iter->value().ToString()); + } + delete iter; + break; + } + case 2: // Validate Prev() + { + Iterator* iter = txn->GetIterator(read_options); + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + verify_key(iter->key().ToString(), iter->value().ToString()); + } + delete iter; + break; + } + default: + ASSERT_TRUE(false); + } - iter->Prev(); - verify_state(iter, "a", "v7"); + if (rnd.OneIn(2)) { + ASSERT_OK(txn->Commit()); + } else { + ASSERT_OK(txn->Rollback()); + } + delete txn; + } + }; - // Since the unprep_seqs_ data were faked for testing, we do not want the - // destructor for the transaction to be rolling back data that did not - // exist. 
- wup_txn->unprep_seqs_.clear(); + std::vector threads; + for (uint32_t i = 0; i < kNumThreads; i++) { + threads.emplace_back(stress_thread, i); + } - db->ReleaseSnapshot(snapshot0); - db->ReleaseSnapshot(snapshot2); - db->ReleaseSnapshot(snapshot4); - db->ReleaseSnapshot(snapshot6); - db->ReleaseSnapshot(snapshot8); - delete iter; - delete txn; + for (auto& t : threads) { + t.join(); + } + } } // This tests how write unprepared behaves during recovery when the DB crashes diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index d127220e47d..4d1401b3aa1 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -32,7 +32,7 @@ bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { SequenceNumber WriteUnpreparedTxnReadCallback::CalcMaxUnpreparedSequenceNumber( WriteUnpreparedTxn* txn) { - auto unprep_seqs = txn->GetUnpreparedSequenceNumbers(); + const auto& unprep_seqs = txn->GetUnpreparedSequenceNumbers(); if (unprep_seqs.size()) { return unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; } @@ -44,7 +44,8 @@ WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, const TransactionOptions& txn_options) : WritePreparedTxn(txn_db, write_options, txn_options), wupt_db_(txn_db), - recovered_txn_(false) { + recovered_txn_(false), + largest_validated_seq_(0) { max_write_batch_size_ = txn_options.max_write_batch_size; // We set max bytes to zero so that we don't get a memory limit error. // Instead of trying to keep write batch strictly under the size limit, we @@ -85,75 +86,82 @@ void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { write_batch_.SetMaxBytes(0); unprep_seqs_.clear(); recovered_txn_ = false; + largest_validated_seq_ = 0; } -Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value, - const bool assume_tracked) { +Status WriteUnpreparedTxn::HandleWrite(std::function do_write) { Status s = MaybeFlushWriteBatchToDB(); if (!s.ok()) { return s; } - return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + s = do_write(); + if (s.ok()) { + if (snapshot_) { + largest_validated_seq_ = + std::max(largest_validated_seq_, snapshot_->GetSequenceNumber()); + } else { + largest_validated_seq_ = kMaxSequenceNumber; + } + } + return s; +} + +Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const bool assume_tracked) { + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); } Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family, const SliceParts& key, const SliceParts& value, const bool assume_tracked) { - Status s = MaybeFlushWriteBatchToDB(); - if (!s.ok()) { - return s; - } - return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + return HandleWrite([&]() { + return TransactionBaseImpl::Put(column_family, key, value, assume_tracked); + }); } Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, const bool assume_tracked) { - Status s = MaybeFlushWriteBatchToDB(); - if (!s.ok()) { - return s; - } - return TransactionBaseImpl::Merge(column_family, key, value, assume_tracked); + return HandleWrite([&]() { + return TransactionBaseImpl::Merge(column_family, key, value, + assume_tracked); + }); } Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* 
column_family, const Slice& key, const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
 }

 Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
                                   const SliceParts& key,
                                   const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
 }

 Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
                                         const Slice& key,
                                         const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::SingleDelete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
 }

 Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
                                         const SliceParts& key,
                                         const bool assume_tracked) {
-  Status s = MaybeFlushWriteBatchToDB();
-  if (!s.ok()) {
-    return s;
-  }
-  return TransactionBaseImpl::SingleDelete(column_family, key, assume_tracked);
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
 }

 // WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For
diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h
index 15a76d13437..b64fd81e611 100644
--- a/utilities/transactions/write_unprepared_txn.h
+++ b/utilities/transactions/write_unprepared_txn.h
@@ -17,6 +17,40 @@ namespace rocksdb {
 class WriteUnpreparedTxnDB;
 class WriteUnpreparedTxn;

+// WriteUnprepared transactions need to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may be flushed to DB already, we cannot
+// rely on the contents of WriteBatchWithIndex to determine whether a key should
+// be visible or not, so we have to remember to check the DB for any uncommitted
+// keys that should be visible to us. First, we will need to change the seek to
+// snapshot logic, to seek to max_visible_seq = max(snap_seq, max_unprep_seq).
+// Any key greater than max_visible_seq should not be visible, because it
+// cannot be unprepared by the current transaction and it is not in its
+// snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit an unprepared key from the current transaction.
+// 2. We hit an unprepared key from another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning if the same transaction is adding more unprep seqs through
+// writes during iteration, these newer writes may not be visible. This is not a
+// problem for MySQL though, because it avoids modifying the index as it is
+// scanning through it to avoid the Halloween Problem. Instead, it scans the
+// index once up front, and modifies based on a temporary copy.
+// +// In DBIter, there is a "reseek" optimization if the iterator skips over too +// many keys. However, this assumes that the reseek seeks exactly to the +// required key. In write unprepared, even after seeking directly to +// max_visible_seq, some iteration may be required before hitting a visible key, +// and special precautions must be taken to avoid performing another reseek, +// leading to an infinite loop. +// class WriteUnpreparedTxnReadCallback : public ReadCallback { public: WriteUnpreparedTxnReadCallback(WritePreparedTxnDB* db, @@ -25,7 +59,7 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { WriteUnpreparedTxn* txn) // Pass our last uncommitted seq as the snapshot to the parent class to // ensure that the parent will not prematurely filter out own writes. We - // will do the exact comparison agaisnt snapshots in IsVisibleFullCheck + // will do the exact comparison against snapshots in IsVisibleFullCheck // override. : ReadCallback(CalcMaxVisibleSeq(txn, snapshot), min_uncommitted), db_(db), @@ -34,12 +68,6 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { virtual bool IsVisibleFullCheck(SequenceNumber seq) override; - bool CanReseekToSkip() override { - return wup_snapshot_ == max_visible_seq_; - // Otherwise our own writes uncommitted are in db, and the assumptions - // behind reseek optimizations are no longer valid. - } - void Refresh(SequenceNumber seq) override { max_visible_seq_ = std::max(max_visible_seq_, seq); wup_snapshot_ = seq; @@ -130,6 +158,7 @@ class WriteUnpreparedTxn : public WritePreparedTxn { Status MaybeFlushWriteBatchToDB(); Status FlushWriteBatchToDB(bool prepared); + Status HandleWrite(std::function do_write); // For write unprepared, we check on every writebatch append to see if // max_write_batch_size_ has been exceeded, and then call @@ -153,6 +182,20 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // locked for efficiency reasons. For recovered transactions, skip unlocking // keys when transaction ends. bool recovered_txn_; + + // Track the largest sequence number at which we performed snapshot + // validation. If snapshot validation was skipped because no snapshot was set, + // then this is set to kMaxSequenceNumber. This value is useful because it + // means that for keys that have unprepared seqnos, we can guarantee that no + // committed keys by other transactions can exist between + // largest_validated_seq_ and max_unprep_seq. See + // WriteUnpreparedTxnDB::NewIterator for an explanation for why this is + // necessary for iterator Prev(). + // + // Currently this value only increases during the lifetime of a transaction, + // but in some cases, we should be able to restore the previously largest + // value when calling RollbackToSavepoint. 
+ SequenceNumber largest_validated_seq_; }; } // namespace rocksdb diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index c4be058bb96..c3fcd1f45d2 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -368,25 +368,77 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, constexpr bool ALLOW_BLOB = true; constexpr bool ALLOW_REFRESH = true; std::shared_ptr own_snapshot = nullptr; - SequenceNumber snapshot_seq; + SequenceNumber snapshot_seq = kMaxSequenceNumber; SequenceNumber min_uncommitted = 0; - if (options.snapshot != nullptr) { - snapshot_seq = options.snapshot->GetSequenceNumber(); - min_uncommitted = - static_cast_with_check( - options.snapshot) - ->min_uncommitted_; - } else { - auto* snapshot = GetSnapshot(); - // We take a snapshot to make sure that the related data in the commit map - // are not deleted. - snapshot_seq = snapshot->GetSequenceNumber(); - min_uncommitted = - static_cast_with_check(snapshot) - ->min_uncommitted_; + + // Currently, the Prev() iterator logic does not work well without snapshot + // validation. The logic simply iterates through values of a key in + // ascending seqno order, stopping at the first non-visible value and + // returning the last visible value. + // + // For example, if snapshot sequence is 3, and we have the following keys: + // foo: v1 1 + // foo: v2 2 + // foo: v3 3 + // foo: v4 4 + // foo: v5 5 + // + // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3, + // which is the last visible key. + // + // For unprepared transactions, if we have snap_seq = 3, but the current + // transaction has unprep_seq 5, then returning the first non-visible key + // would be incorrect, as we should return v5, and not v3. The problem is that + // there are committed keys at snapshot_seq < commit_seq < unprep_seq. + // + // Snapshot validation can prevent this problem by ensuring that no committed + // keys exist at snapshot_seq < commit_seq, and thus any value with a sequence + // number greater than snapshot_seq must be unprepared keys. For example, if + // the transaction had a snapshot at 3, then snapshot validation would be + // performed during the Put(v5) call. It would find v4, and the Put would fail + // with snapshot validation failure. + // + // Because of this, if any writes have occurred, then the transaction snapshot + // must be used for the iterator. If no writes have occurred though, we can + // simply create a snapshot. Later writes would not be visible though, but we + // don't support iterating while writing anyway. + // + // TODO(lth): Improve Prev() logic to continue iterating until + // max_visible_seq, and then return the last visible key, so that this + // restriction can be lifted. + const Snapshot* snapshot = nullptr; + if (options.snapshot == nullptr) { + snapshot = GetSnapshot(); own_snapshot = std::make_shared(db_impl_, snapshot); + } else { + snapshot = options.snapshot; } + + snapshot_seq = snapshot->GetSequenceNumber(); assert(snapshot_seq != kMaxSequenceNumber); + // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are + // guaranteed that for keys that were modified by this transaction (and thus + // might have unprepared versions), no committed versions exist at + // largest_validated_seq < commit_seq (or the contrapositive: any committed + // version must exist at commit_seq <= largest_validated_seq). 
This implies
+  // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <=
+  // snapshot_seq. As explained above, the problem with Prev() only happens when
+  // snapshot_seq < commit_seq.
+  //
+  // For keys that were not modified by this transaction, largest_validated_seq_
+  // is meaningless, and Prev() should just work with the existing visibility
+  // logic.
+  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
+      !txn->unprep_seqs_.empty()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "WriteUnprepared iterator creation failed since the "
+                    "transaction has performed unvalidated writes");
+    return nullptr;
+  }
+  min_uncommitted =
+      static_cast_with_check<const SnapshotImpl, const Snapshot>(snapshot)
+          ->min_uncommitted_;
+
   auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
   auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);

From 112702ac6cd14dfb2f0fbf929216deabcf2ccafc Mon Sep 17 00:00:00 2001
From: anand76
Date: Tue, 23 Jul 2019 11:12:25 -0700
Subject: [PATCH 245/572] Parallelize file_reader_writer_test in order to reduce timeouts

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5608

Test Plan:
make check
buck test mode/dev-tsan internal_repo_rocksdb/repo:file_reader_writer_test -- --run-disabled

Differential Revision: D16441796

Pulled By: anand1976

fbshipit-source-id: afbb88a9fcb1c0ba22215118767e8eab3d1d6a4a
---
 Makefile | 2 +-
 TARGETS | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index f8a904bd39d..65d884fa4fb 100644
--- a/Makefile
+++ b/Makefile
@@ -480,7 +480,6 @@ TESTS = \
 	fault_injection_test \
 	filelock_test \
 	filename_test \
-	file_reader_writer_test \
 	block_based_filter_block_test \
 	full_filter_block_test \
 	partitioned_filter_block_test \
@@ -580,6 +579,7 @@ PARALLEL_TEST = \
 	external_sst_file_test \
 	import_column_family_test \
 	fault_injection_test \
+	file_reader_writer_test \
 	inlineskiplist_test \
 	manual_compaction_test \
 	persistent_cache_test \
diff --git a/TARGETS b/TARGETS
index cfd9ef73d40..9246af36361 100644
--- a/TARGETS
+++ b/TARGETS
@@ -753,7 +753,7 @@ ROCKS_TESTS = [
     [
         "file_reader_writer_test",
         "util/file_reader_writer_test.cc",
-        "serial",
+        "parallel",
     ],
     [
         "filelock_test",

From 3782accf7de5830fef1fc88d69bbe2d9259b023f Mon Sep 17 00:00:00 2001
From: sdong
Date: Tue, 23 Jul 2019 13:56:52 -0700
Subject: [PATCH 246/572] ldb sometimes specifies a string-append merge operator (#5607)

Summary:
Right now, ldb cannot scan a DB containing merge operands using the default ldb options. There is no harm in providing a general merge operator so that ldb can at least print out something.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5607

Test Plan: Run ldb against a DB with merge operands and see the outputs.

Differential Revision: D16442634

fbshipit-source-id: c66c414ec07f219cfc6e6ec2cc14c783ee95df54
---
 HISTORY.md | 1 +
 tools/ldb_cmd.cc | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 19f4ce1297c..04f194e9258 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,6 +18,7 @@
 * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path
 * Overload GetAllKeyVersions() to support non-default column family.
 * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family.
https://github.com/facebook/rocksdb/issues/3469
+* ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator.

 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 22b2399a278..338f09fb992 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -31,6 +31,7 @@
 #include "util/coding.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
+#include "utilities/merge_operators.h"
 #include "utilities/ttl/db_ttl_impl.h"

 #include
@@ -353,11 +354,24 @@ void LDBCommand::OpenDB() {
           stderr,
           "wal_dir loaded from the option file doesn't exist. Ignore it.\n");
     }
+
+    // If merge operator is not set, set a string append operator. There is
+    // no harm doing it.
+    for (auto& cf_entry : column_families_) {
+      if (!cf_entry.options.merge_operator) {
+        cf_entry.options.merge_operator =
+            MergeOperators::CreateStringAppendOperator(':');
+      }
+    }
   }
   options_ = PrepareOptionsForOpenDB();
   if (!exec_state_.IsNotStarted()) {
     return;
   }
+  if (column_families_.empty() && !options_.merge_operator) {
+    // No harm to add a general merge operator if it is not specified.
+    options_.merge_operator = MergeOperators::CreateStringAppendOperator(':');
+  }
   // Open the DB.
   Status st;
   std::vector<ColumnFamilyHandle*> handles_opened;

From 6b7fcc0d5f8c91f891f243906e6431969cfa8d11 Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Tue, 23 Jul 2019 15:30:59 -0700
Subject: [PATCH 247/572] Improve CPU Efficiency of ApproximateSize (part 1) (#5613)

Summary:
1. Avoid creating an iterator just to call BlockBasedTable::ApproximateOffsetOf(). Instead, call into it directly.
2. Optimize BlockBasedTable::ApproximateOffsetOf() so that it keeps the index block iterator on the stack.
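[Editor's note] A minimal sketch of the stack-allocation pattern from point 2 above, using simplified stand-in types rather than the real BlockBasedTable API. The point is the ownership rule: the callee may either reuse the caller's stack iterator or heap-allocate one, and the caller frees the iterator only if it did not come from the stack (the block_based_table_reader.cc hunk below shows the real version).

#include <cstdint>

// Simplified stand-in for an index block iterator.
struct IndexIter {
  uint64_t offset = 0;
  void Seek(uint64_t /*key*/) { /* position the iterator; no-op here */ }
};

// Stand-in for NewIndexIterator(): reuses the caller-provided iterator when
// possible, otherwise heap-allocates (e.g. for more complex index types).
IndexIter* NewIndexIterator(IndexIter* input_iter, bool can_reuse) {
  return can_reuse ? input_iter : new IndexIter();
}

uint64_t ApproximateOffsetOfSketch(uint64_t key, bool can_reuse) {
  IndexIter iiter_on_stack;  // avoids a heap allocation on the common path
  IndexIter* iiter = NewIndexIterator(&iiter_on_stack, can_reuse);
  iiter->Seek(key);
  const uint64_t result = iiter->offset;
  // Free the iterator only if NewIndexIterator() could not use the stack one.
  if (iiter != &iiter_on_stack) {
    delete iiter;
  }
  return result;
}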
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5613 Differential Revision: D16442660 Pulled By: elipoz fbshipit-source-id: 9320be3e918c139b10e758cbbb684706d172e516 --- db/table_cache.cc | 28 ++++++++++++++++++- db/table_cache.h | 6 ++++ db/version_set.cc | 19 ++++--------- db/version_set.h | 4 +-- table/block_based/block_based_table_reader.cc | 12 ++++++-- table/block_based/block_based_table_reader.h | 7 +++-- 6 files changed, 55 insertions(+), 21 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index 2290b5939c5..48415beff34 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -194,7 +194,7 @@ InternalIterator* TableCache::NewIterator( if (table_reader == nullptr) { s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, + !for_compaction /* record_read_stats */, file_read_hist, skip_filters, level); if (s.ok()) { table_reader = GetTableReaderFromHandle(handle); @@ -505,4 +505,30 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } +uint64_t TableCache::ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateOffsetOf(key, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} } // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h index f9fd4815228..89a0b1b5c63 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -153,6 +153,12 @@ class TableCache { const FileDescriptor& fd, const SliceTransform* prefix_extractor = nullptr); + // Returns approximated offset of a key in a file represented by fd. + uint64_t ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); diff --git a/db/version_set.cc b/db/version_set.cc index 559a4190f16..281065d0502 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4974,19 +4974,12 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, } else { // "key" falls in the range for this table. Add the // approximate offset of "key" within the table. 
- TableReader* table_reader_ptr; - InternalIterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), - *f.file_metadata, nullptr /* range_del_agg */, - v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr, - /*file_read_hist=*/nullptr, caller, - /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, - /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); - if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key, caller); - } - delete iter; + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(), + v->GetMutableCFOptions().prefix_extractor.get()); + } } return result; } diff --git a/db/version_set.h b/db/version_set.h index 6b7c42881c1..ee94f5966df 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -655,7 +655,7 @@ class Version { uint64_t GetSstFilesSize(); - MutableCFOptions GetMutableCFOptions() { return mutable_cf_options_; } + const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } private: Env* env_; @@ -981,7 +981,7 @@ class VersionSet { void AddLiveFiles(std::vector* live_list); // Return the approximate size of data to be scanned for range [start, end) - // in levels [start_level, end_level). If end_level == 0 it will search + // in levels [start_level, end_level). If end_level == -1 it will search // through all non-empty levels uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, int start_level, int end_level, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index fde11c0d362..000bc295fc1 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -4018,10 +4018,11 @@ Status BlockBasedTable::CreateIndexReader( uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { BlockCacheLookupContext context(caller); - std::unique_ptr> index_iter( + IndexBlockIter iiter_on_stack; + auto index_iter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, - /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/&context)); + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); index_iter->Seek(key); uint64_t result; @@ -4041,6 +4042,11 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, result = rep_->footer.metaindex_handle().offset(); } } + + if (index_iter != &iiter_on_stack) { + delete index_iter; + } + return result; } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 189cd5d2e3a..3a16e2995fb 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -318,8 +318,11 @@ class BlockBasedTable : public TableReader { BlockCacheLookupContext* lookup_context) const; // Get the iterator from the index reader. - // If input_iter is not set, return new Iterator - // If input_iter is set, update it and return it as Iterator + // + // If input_iter is not set, return a new Iterator. + // If input_iter is set, try to update it and return it as Iterator. + // However note that in some cases the returned iterator may be different + // from input_iter. In such case the returned iterator should be freed. 
// // Note: ErrorIterator with Status::Incomplete shall be returned if all the // following conditions are met:
From 092f41703798011db3cc118d1b32c8ca5ddf9749 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 23 Jul 2019 15:57:43 -0700 Subject: [PATCH 248/572] Move the uncompression dictionary object out of the block cache (#5584)
Summary: RocksDB has historically stored uncompression dictionary objects in the block cache as opposed to storing just the block contents. This necessitated evicting the object upon table close. With the new code, only the raw blocks are stored in the cache, eliminating the need for eviction. In addition, the patch makes the following improvements: 1) Compression dictionary blocks are now prefetched/pinned similarly to index/filter blocks. 2) A copy operation was eliminated when the uncompression dictionary is retrieved. 3) Errors related to retrieving the uncompression dictionary are propagated as opposed to silently ignored. Note: the patch temporarily breaks the compression dictionary eviction stats. They will be fixed in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5584 Test Plan: make asan_check Differential Revision: D16344151 Pulled By: ltamasi fbshipit-source-id: 2962b295f5b19628f9da88a3fcebbce5a5017a7b
--- CMakeLists.txt | 1 + HISTORY.md | 7 +- TARGETS | 1 + db/db_block_cache_test.cc | 67 +++-- db/version_set.cc | 6 - include/rocksdb/cache.h | 5 - src.mk | 1 + .../block_based_filter_block_test.cc | 7 +- table/block_based/block_based_table_reader.cc | 249 ++++-------------- table/block_based/block_based_table_reader.h | 16 +- table/block_based/full_filter_block_test.cc | 7 +- table/block_based/partitioned_filter_block.cc | 2 +- .../partitioned_filter_block_test.cc | 4 - .../block_based/uncompression_dict_reader.cc | 138 ++++++++++ table/block_based/uncompression_dict_reader.h | 64 +++++ table/table_reader.h | 2 - table/table_test.cc | 170 ------------ util/compression.h | 93 +++++-- 18 files changed, 391 insertions(+), 449 deletions(-) create mode 100644 table/block_based/uncompression_dict_reader.cc create mode 100644 table/block_based/uncompression_dict_reader.h
diff --git a/CMakeLists.txt b/CMakeLists.txt index b49a13572bb..0bd7311498f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -601,6 +601,7 @@ set(SOURCES table/block_based/full_filter_block.cc table/block_based/index_builder.cc table/block_based/partitioned_filter_block.cc + table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc table/bloom_block.cc table/cuckoo/cuckoo_table_builder.cc
diff --git a/HISTORY.md b/HISTORY.md index 04f194e9258..d452a68a30f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,9 +6,10 @@ ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. -* Index and filter blocks are now handled similarly to data blocks with regards to the block cache: instead of storing reader objects in the cache, only the blocks themselves are cached. In addition, index and filter blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any). +* Index, filter, and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached.
In addition, index, filter, and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any), and cached index blocks can be shared among multiple table readers. * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to the above refactoring, block cache eviction statistics for indexes and filters are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes, filters, and compression dictionaries are temporarily broken. We plan to reintroduce them in a later phase. +* Errors related to the retrieval of the compression dictionary are now propagated to the user. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. @@ -26,6 +27,7 @@ * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. +* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. @@ -35,6 +37,7 @@ * Log Writer will flush after finishing the whole record, rather than a fragment. * Lower MultiGet batching API latency by reading data blocks from disk in parallel * Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. +* The compression dictionary is no longer copied to a new object upon retrieval. ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. 
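The "no longer copied" improvement noted above comes from the reworked `UncompressionDict` in `util/compression.h` near the end of this patch: the dictionary now holds a `Slice` into the cached block and inherits from `Cleanable`, so it can take over releasing the block rather than duplicating the bytes into a `std::string`. Below is a minimal standalone sketch of that borrow-and-transfer pattern; the `Slice`, `Cleanable`, and `Dict` types here are simplified stand-ins, not the actual RocksDB classes:

```
#include <cassert>
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Simplified stand-ins for rocksdb::Slice and rocksdb::Cleanable.
struct Slice {
  const char* data;
  size_t size;
};

class Cleanable {
 public:
  ~Cleanable() {
    for (auto& cleanup : cleanups_) {
      cleanup();  // release whatever this object borrowed
    }
  }
  void RegisterCleanup(std::function<void()> cleanup) {
    cleanups_.push_back(std::move(cleanup));
  }

 private:
  std::vector<std::function<void()>> cleanups_;
};

// A dictionary that is a zero-copy view of a cached block; ownership of the
// block travels with the dictionary via the registered cleanup.
struct Dict : public Cleanable {
  Slice raw;
};

int main() {
  auto* block = new std::string("dictionary bytes");  // stands in for a cached block
  Dict dict;
  dict.raw = Slice{block->data(), block->size()};   // a view, not a copy
  dict.RegisterCleanup([block] { delete block; });  // dict now owns the block
  assert(dict.raw.size == block->size());
  return 0;
}  // the block is freed here, when dict (a Cleanable) is destroyed
```

In the actual patch, the equivalent handoff happens in `UncompressionDictReader::GetOrReadUncompressionDictionary()` via `CachableEntry::TransferTo()`, as shown in the `uncompression_dict_reader.cc` hunk below.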
diff --git a/TARGETS b/TARGETS index 9246af36361..122da8b542f 100644 --- a/TARGETS +++ b/TARGETS @@ -198,6 +198,7 @@ cpp_library( "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", "table/block_based/partitioned_filter_block.cc", + "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", "table/bloom_block.cc", "table/cuckoo/cuckoo_table_builder.cc", diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 77f37da0d45..422fd83bc20 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -19,6 +19,9 @@ class DBBlockCacheTest : public DBTestBase { size_t hit_count_ = 0; size_t insert_count_ = 0; size_t failure_count_ = 0; + size_t compression_dict_miss_count_ = 0; + size_t compression_dict_hit_count_ = 0; + size_t compression_dict_insert_count_ = 0; size_t compressed_miss_count_ = 0; size_t compressed_hit_count_ = 0; size_t compressed_insert_count_ = 0; @@ -69,6 +72,15 @@ class DBBlockCacheTest : public DBTestBase { TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); } + void RecordCacheCountersForCompressionDict(const Options& options) { + compression_dict_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + compression_dict_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + compression_dict_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + } + void CheckCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, size_t expected_inserts, size_t expected_failures) { @@ -87,6 +99,28 @@ class DBBlockCacheTest : public DBTestBase { failure_count_ = new_failure_count; } + void CheckCacheCountersForCompressionDict( + const Options& options, size_t expected_compression_dict_misses, + size_t expected_compression_dict_hits, + size_t expected_compression_dict_inserts) { + size_t new_compression_dict_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + size_t new_compression_dict_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + size_t new_compression_dict_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses, + new_compression_dict_miss_count); + ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits, + new_compression_dict_hit_count); + ASSERT_EQ( + compression_dict_insert_count_ + expected_compression_dict_inserts, + new_compression_dict_insert_count); + compression_dict_miss_count_ = new_compression_dict_miss_count; + compression_dict_hit_count_ = new_compression_dict_hit_count; + compression_dict_insert_count_ = new_compression_dict_insert_count; + } + void CheckCompressedCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, @@ -671,6 +705,8 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { options.table_factory.reset(new BlockBasedTableFactory(table_options)); DestroyAndReopen(options); + RecordCacheCountersForCompressionDict(options); + for (int i = 0; i < kNumFiles; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); for (int j = 0; j < kNumEntriesPerFile; ++j) { @@ -683,27 +719,26 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); + // Compression dictionary blocks are preloaded. 
+ CheckCacheCountersForCompressionDict( + options, kNumFiles /* expected_compression_dict_misses */, + 0 /* expected_compression_dict_hits */, + kNumFiles /* expected_compression_dict_inserts */); + // Seek to a key in a file. It should cause the SST's dictionary meta-block // to be read. RecordCacheCounters(options); - ASSERT_EQ(0, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_EQ( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + RecordCacheCountersForCompressionDict(options); ReadOptions read_options; ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); - // Two blocks missed/added: dictionary and data block - // One block hit: index since it's prefetched - CheckCacheCounters(options, 2 /* expected_misses */, 1 /* expected_hits */, - 2 /* expected_inserts */, 0 /* expected_failures */); - ASSERT_EQ(1, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + // Two block hits: index and dictionary since they are prefetched + // One block missed/added: data block + CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */, + 1 /* expected_inserts */, 0 /* expected_failures */); + CheckCacheCountersForCompressionDict( + options, 0 /* expected_compression_dict_misses */, + 1 /* expected_compression_dict_hits */, + 0 /* expected_compression_dict_inserts */); } } diff --git a/db/version_set.cc b/db/version_set.cc index 281065d0502..7d477a6806b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3420,16 +3420,10 @@ VersionSet::VersionSet(const std::string& dbname, env_options_(storage_options), block_cache_tracer_(block_cache_tracer) {} -void CloseTables(void* ptr, size_t) { - TableReader* table_reader = reinterpret_cast(ptr); - table_reader->Close(); -} - VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet Cache* table_cache = column_family_set_->get_table_cache(); - table_cache->ApplyToAllCacheEntries(&CloseTables, false /* thread_safe */); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 410c2cf827a..6bde575e0fc 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -250,11 +250,6 @@ class Cache { virtual std::string GetPrintableOptions() const { return ""; } - // Mark the last inserted object as being a raw data block. This will be used - // in tests. The default implementation does nothing. 
- virtual void TEST_mark_as_data_block(const Slice& /*key*/, - size_t /*charge*/) {} - MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } private: diff --git a/src.mk b/src.mk index 4d635173b89..0f04fc73916 100644 --- a/src.mk +++ b/src.mk @@ -121,6 +121,7 @@ LIB_SOURCES = \ table/block_based/full_filter_block.cc \ table/block_based/index_builder.cc \ table/block_based/partitioned_filter_block.cc \ + table/block_based/uncompression_dict_reader.cc \ table/block_fetcher.cc \ table/bloom_block.cc \ table/cuckoo/cuckoo_table_builder.cc \ diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index 70bbde96ac8..d223dec6e1f 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -45,10 +45,7 @@ class TestHashFilter : public FilterPolicy { class MockBlockBasedTable : public BlockBasedTable { public: explicit MockBlockBasedTable(Rep* rep) - : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { - // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; - } + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} }; class FilterBlockTest : public testing::Test { @@ -64,7 +61,6 @@ class FilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(new TestHashFilter); constexpr bool skip_filters = false; @@ -271,7 +267,6 @@ class BlockBasedFilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); constexpr bool skip_filters = false; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 000bc295fc1..314763ec3b4 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -63,7 +63,6 @@ extern const std::string kHashIndexPrefixesMetadataBlock; typedef BlockBasedTable::IndexReader IndexReader; BlockBasedTable::~BlockBasedTable() { - Close(); delete rep_; } @@ -148,8 +147,6 @@ void DeleteCachedEntry(const Slice& /*key*/, void* value) { delete entry; } -void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); - // Release the cached entry and decrement its ref count. 
void ForceReleaseCachedEntry(void* arg, void* h) { Cache* cache = reinterpret_cast(arg); @@ -1419,37 +1416,6 @@ Status BlockBasedTable::ReadRangeDelBlock( return s; } -Status BlockBasedTable::ReadCompressionDictBlock( - FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) const { - assert(compression_dict_block != nullptr); - Status s; - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_cont{new BlockContents()}; - PersistentCacheOptions cache_options; - ReadOptions read_options; - read_options.verify_checksums = true; - BlockFetcher compression_block_fetcher( - rep_->file.get(), prefetch_buffer, rep_->footer, read_options, - rep_->compression_dict_handle, compression_dict_cont.get(), - rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, - BlockType::kCompressionDictionary, UncompressionDict::GetEmptyDict(), - cache_options); - s = compression_block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - ROCKS_LOG_WARN( - rep_->ioptions.info_log, - "Encountered error while reading data from compression dictionary " - "block %s", - s.ToString().c_str()); - } else { - *compression_dict_block = std::move(compression_dict_cont); - } - } - return s; -} - Status BlockBasedTable::PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, @@ -1555,23 +1521,16 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } - // TODO(ajkr): also prefetch compression dictionary block - // TODO(ajkr): also pin compression dictionary block when - // `pin_l0_filter_and_index_blocks_in_cache == true`. - if (!table_options.cache_index_and_filter_blocks) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr uncompression_dict_reader; + s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache, + prefetch_all, pin_all, lookup_context, + &uncompression_dict_reader); if (!s.ok()) { return s; } - if (!rep_->compression_dict_handle.IsNull()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - rep_->uncompression_dict.reset(new UncompressionDict( - compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); - } + rep_->uncompression_dict_reader = std::move(uncompression_dict_reader); } assert(s.ok()); @@ -1609,8 +1568,8 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const { if (rep_->index_reader) { usage += rep_->index_reader->ApproximateMemoryUsage(); } - if (rep_->uncompression_dict) { - usage += rep_->uncompression_dict->ApproximateMemoryUsage(); + if (rep_->uncompression_dict_reader) { + usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage(); } return usage; } @@ -1757,9 +1716,6 @@ Status BlockBasedTable::GetDataBlockFromCache( Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, &cache_handle); -#ifndef NDEBUG - block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG if (s.ok()) { assert(cache_handle != nullptr); block->SetCachedValue(block_holder.release(), block_cache, @@ -1863,9 +1819,6 @@ Status BlockBasedTable::PutDataBlockToCache( s = block_cache->Insert(block_cache_key, block_holder.get(), charge, &DeleteCachedEntry, &cache_handle, priority); -#ifndef NDEBUG - 
block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG if (s.ok()) { assert(cache_handle != nullptr); cached_block->SetCachedValue(block_holder.release(), block_cache, @@ -1914,86 +1867,6 @@ std::unique_ptr BlockBasedTable::CreateFilterBlockReader( } } -CachableEntry BlockBasedTable::GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const { - if (!rep_->table_options.cache_index_and_filter_blocks) { - // block cache is either disabled or not used for meta-blocks. In either - // case, BlockBasedTableReader is the owner of the uncompression dictionary. - return {rep_->uncompression_dict.get(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; - } - if (rep_->compression_dict_handle.IsNull()) { - return CachableEntry(); - } - char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key_buf); - auto cache_handle = - GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key, - BlockType::kCompressionDictionary, get_context); - UncompressionDict* dict = nullptr; - bool is_cache_hit = false; - size_t usage = 0; - if (cache_handle != nullptr) { - dict = reinterpret_cast( - rep_->table_options.block_cache->Value(cache_handle)); - is_cache_hit = true; - usage = dict->ApproximateMemoryUsage(); - } else if (no_io) { - // Do not invoke any io. - } else { - std::unique_ptr compression_dict_block; - Status s = - ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); - if (s.ok()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - std::unique_ptr uncompression_dict( - new UncompressionDict(compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed, - rep_->ioptions.statistics)); - usage = uncompression_dict->ApproximateMemoryUsage(); - s = rep_->table_options.block_cache->Insert( - cache_key, uncompression_dict.get(), usage, - &DeleteCachedUncompressionDictEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW); - - if (s.ok()) { - UpdateCacheInsertionMetrics(BlockType::kCompressionDictionary, - get_context, usage); - dict = uncompression_dict.release(); - } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - assert(dict == nullptr); - assert(cache_handle == nullptr); - } - } - } - if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && - lookup_context) { - // Avoid making copy of block_key and cf_name when constructing the access - // record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock, - /*block_size=*/usage, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id, - lookup_context->get_from_user_specified_snapshot, - /*referenced_key=*/""); - block_cache_tracer_->WriteBlockAccess(access_record, cache_key, - rep_->cf_name_for_tracing(), - lookup_context->referenced_key); - } - return {dict, cache_handle ? 
rep_->table_options.block_cache.get() : nullptr, - cache_handle, false /* own_value */}; -} - // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex InternalIteratorBase* BlockBasedTable::NewIndexIterator( @@ -2028,13 +1901,17 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( return iter; } - const bool no_io = (ro.read_tier == kBlockCacheTier); - auto uncompression_dict_storage = - GetUncompressionDict(prefetch_buffer, no_io, get_context, lookup_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.GetValue() == nullptr - ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.GetValue(); + UncompressionDict uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block, @@ -2268,7 +2145,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { Statistics* statistics = rep_->ioptions.statistics; const bool maybe_compressed = - block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; const bool do_uncompress = maybe_compressed && !block_cache_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; @@ -2321,6 +2200,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( case BlockType::kFilter: trace_block_type = TraceType::kBlockTraceFilterBlock; break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; case BlockType::kRangeDeletion: trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; break; @@ -2568,7 +2450,9 @@ Status BlockBasedTable::RetrieveBlock( } const bool maybe_compressed = - block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; const bool do_uncompress = maybe_compressed; std::unique_ptr block; @@ -3504,12 +3388,17 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); - auto uncompression_dict_storage = GetUncompressionDict( - nullptr, no_io, sst_file_range.begin()->get_context, &lookup_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.GetValue() == nullptr - ? 
UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.GetValue(); + + UncompressionDict uncompression_dict; + Status uncompression_dict_status; + if (rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + } + size_t total_len = 0; ReadOptions ro = read_options; ro.read_tier = kBlockCacheTier; @@ -3535,6 +3424,14 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, sst_file_range.SkipKey(miter); continue; } + + if (!uncompression_dict_status.ok()) { + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + statuses.emplace_back(); results.emplace_back(); if (v.handle.offset() == offset) { @@ -4191,23 +4088,25 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } // Output compression dictionary - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(nullptr /* prefetch_buffer */, - &compression_dict_block); + if (rep_->uncompression_dict_reader) { + UncompressionDict uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); if (!s.ok()) { return s; } - assert(compression_dict_block != nullptr); - auto compression_dict = compression_dict_block->data; + + const Slice& raw_dict = uncompression_dict.GetRawDict(); out_file->Append( "Compression Dictionary:\n" "--------------------------------------\n"); out_file->Append(" size (bytes): "); - out_file->Append(rocksdb::ToString(compression_dict.size())); + out_file->Append(rocksdb::ToString(raw_dict.size())); out_file->Append("\n\n"); out_file->Append(" HEX "); - out_file->Append(compression_dict.ToString(true).c_str()); + out_file->Append(raw_dict.ToString(true).c_str()); out_file->Append("\n\n"); } @@ -4233,29 +4132,6 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { return s; } -void BlockBasedTable::Close() { - if (rep_->closed) { - return; - } - - // cleanup index, filter, and compression dictionary blocks - // to avoid accessing dangling pointers - if (!rep_->table_options.no_block_cache) { - if (!rep_->compression_dict_handle.IsNull()) { - // Get the compression dictionary block key - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); - - Cache* const cache = rep_->table_options.block_cache.get(); - cache->Erase(key); - } - } - - rep_->closed = true; -} - Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" @@ -4431,15 +4307,4 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, out_file->Append("\n ------\n"); } -namespace { - -void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { - UncompressionDict* dict = reinterpret_cast(value); - RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, - dict->ApproximateMemoryUsage()); - delete dict; -} - -} // anonymous namespace - } // namespace rocksdb diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 3a16e2995fb..85346d75c72 100644 --- 
a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -29,6 +29,7 @@ #include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" +#include "table/block_based/uncompression_dict_reader.h" #include "table/format.h" #include "table/get_context.h" #include "table/multiget_context.h" @@ -176,8 +177,6 @@ class BlockBasedTable : public TableReader { Status VerifyChecksum(TableReaderCaller caller) override; - void Close() override; - ~BlockBasedTable(); bool TEST_FilterBlockInCache() const; @@ -242,8 +241,11 @@ class BlockBasedTable : public TableReader { template friend class FilterBlockReaderCommon; + friend class PartitionIndexReader; + friend class UncompressionDictReader; + protected: Rep* rep_; explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) @@ -313,10 +315,6 @@ class BlockBasedTable : public TableReader { CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, char* scratch, const UncompressionDict& uncompression_dict) const; - CachableEntry GetUncompressionDict( - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; - // Get the iterator from the index reader. // // If input_iter is not set, return a new Iterator. @@ -416,9 +414,6 @@ class BlockBasedTable : public TableReader { InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context); - Status ReadCompressionDictBlock( - FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) const; Status PrefetchIndexAndFilterBlocks( FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, @@ -514,7 +509,7 @@ struct BlockBasedTable::Rep { std::unique_ptr index_reader; std::unique_ptr filter; - std::unique_ptr uncompression_dict; + std::unique_ptr uncompression_dict_reader; enum class FilterType { kNoFilter, @@ -566,7 +561,6 @@ struct BlockBasedTable::Rep { bool index_key_includes_seq = true; bool index_value_is_full = true; - bool closed = false; const bool immortal_table; SequenceNumber get_global_seqno(BlockType block_type) const { diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index e8fcce07d75..b87db6def94 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -44,10 +44,7 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { class MockBlockBasedTable : public BlockBasedTable { public: explicit MockBlockBasedTable(Rep* rep) - : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { - // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; - } + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} }; class TestFilterBitsReader : public FilterBitsReader { @@ -116,7 +113,6 @@ class PluginFullFilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(new TestHashFilter); constexpr bool skip_filters = false; @@ -210,7 +206,6 @@ class FullFilterBlockTest : public testing::Test { : ioptions_(options_), env_options_(options_), icomp_(options_.comparator) { - table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); constexpr bool skip_filters = 
false; diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index ae57e85dca6..158ed84abee 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -324,7 +324,7 @@ void PartitionedFilterBlockReader::CacheDependencies(bool pin) { prefetch_buffer.reset(new FilePrefetchBuffer()); s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, - static_cast(prefetch_len)); + static_cast(prefetch_len)); // After prefetch, read the partitions one by one ReadOptions read_options; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 5e9e467723c..aa667afedf0 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -27,7 +27,6 @@ class MockedBlockBasedTable : public BlockBasedTable { MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } @@ -67,9 +66,6 @@ class PartitionedFilterBlockTest env_options_(options_), icomp_(options_.comparator) { table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close - // will access variable that are not - // initialized in our mocked version table_options_.format_version = GetParam(); table_options_.index_block_restart_interval = 3; } diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 00000000000..d74dbf6c497 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "table/block_based/uncompression_dict_reader.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/compression.h" + +namespace rocksdb { + +Status UncompressionDictReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(uncompression_dict_reader); + + CachableEntry uncompression_dict_block; + if (prefetch || !use_cache) { + const Status s = ReadUncompressionDictionaryBlock( + table, prefetch_buffer, ReadOptions(), nullptr /* get_context */, + lookup_context, &uncompression_dict_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + uncompression_dict_block.Reset(); + } + } + + uncompression_dict_reader->reset( + new UncompressionDictReader(table, std::move(uncompression_dict_block))); + + return Status::OK(); +} + +Status UncompressionDictReader::ReadUncompressionDictionaryBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block) { + // TODO: add perf counter for compression dictionary read time + + assert(table); + assert(uncompression_dict_block); + assert(uncompression_dict_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + assert(!rep->compression_dict_handle.IsNull()); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->compression_dict_handle, + UncompressionDict::GetEmptyDict(), uncompression_dict_block, + BlockType::kCompressionDictionary, get_context, lookup_context); + + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } + + return s; +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionaryBlock( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block) const { + assert(uncompression_dict_block); + + if (!uncompression_dict_block_.IsEmpty()) { + uncompression_dict_block->SetUnownedValue( + uncompression_dict_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadUncompressionDictionaryBlock(table_, prefetch_buffer, read_options, + get_context, lookup_context, + uncompression_dict_block); +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + UncompressionDict* uncompression_dict) const { + CachableEntry uncompression_dict_block; + const Status s = GetOrReadUncompressionDictionaryBlock( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict_block); + + if (!s.ok()) { + return s; + } + + assert(uncompression_dict); + assert(table_); + assert(table_->get_rep()); + + UncompressionDict dict(uncompression_dict_block.GetValue()->data, + table_->get_rep()->blocks_definitely_zstd_compressed); + *uncompression_dict = std::move(dict); + uncompression_dict_block.TransferTo(uncompression_dict); + + return Status::OK(); +} + +size_t 
UncompressionDictReader::ApproximateMemoryUsage() const { + assert(!uncompression_dict_block_.GetOwnValue() || + uncompression_dict_block_.GetValue() != nullptr); + size_t usage = uncompression_dict_block_.GetOwnValue() + ? uncompression_dict_block_.GetValue()->ApproximateMemoryUsage() + : 0; + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + return usage; +} + +} // namespace rocksdb diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h new file mode 100644 index 00000000000..808149e96b3 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/format.h" + +namespace rocksdb { + +class BlockBasedTable; +struct BlockCacheLookupContext; +class FilePrefetchBuffer; +class GetContext; +struct ReadOptions; +struct UncompressionDict; + +// Provides access to the uncompression dictionary regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class UncompressionDictReader { + public: + static Status Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader); + + Status GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + UncompressionDict* uncompression_dict) const; + + size_t ApproximateMemoryUsage() const; + + private: + UncompressionDictReader( + const BlockBasedTable* t, + CachableEntry&& uncompression_dict_block) + : table_(t), + uncompression_dict_block_(std::move(uncompression_dict_block)) { + assert(table_); + } + + static Status ReadUncompressionDictionaryBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block); + + Status GetOrReadUncompressionDictionaryBlock( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict_block) const; + + const BlockBasedTable* table_; + CachableEntry uncompression_dict_block_; +}; + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h index 72d11a7bd24..eb383c8fe8e 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -124,8 +124,6 @@ class TableReader { virtual Status VerifyChecksum(TableReaderCaller /*caller*/) { return Status::NotSupported("VerifyChecksum() not supported"); } - - virtual void Close() {} }; } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index bb034311668..6cd26bc732a 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2889,176 +2889,6 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { } } -// A wrapper around LRICache that also keeps track of data blocks (in contrast -// with the objects) in the cache. 
The class is very simple and can be used only -// for trivial tests. -class MockCache : public LRUCache { - public: - MockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio) - : LRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio) {} - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { - // Replace the deleter with our own so that we keep track of data blocks - // erased from the cache - deleters_[key.ToString()] = deleter; - return ShardedCache::Insert(key, value, charge, &MockDeleter, handle, - priority); - } - // This is called by the application right after inserting a data block - void TEST_mark_as_data_block(const Slice& key, size_t charge) override { - marked_data_in_cache_[key.ToString()] = charge; - marked_size_ += charge; - } - using DeleterFunc = void (*)(const Slice& key, void* value); - static std::map deleters_; - static std::map marked_data_in_cache_; - static size_t marked_size_; - static void MockDeleter(const Slice& key, void* value) { - // If the item was marked for being data block, decrease its usage from the - // total data block usage of the cache - if (marked_data_in_cache_.find(key.ToString()) != - marked_data_in_cache_.end()) { - marked_size_ -= marked_data_in_cache_[key.ToString()]; - } - // Then call the origianl deleter - assert(deleters_.find(key.ToString()) != deleters_.end()); - auto deleter = deleters_[key.ToString()]; - deleter(key, value); - } -}; - -size_t MockCache::marked_size_ = 0; -std::map MockCache::deleters_; -std::map MockCache::marked_data_in_cache_; - -// Block cache can contain raw data blocks as well as general objects. If an -// object depends on the table to be live, it then must be destructed before the -// table is closed. This test makes sure that the only items remains in the -// cache after the table is closed are raw data blocks. -TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { - std::vector compression_types{kNoCompression}; - - // The following are the compression library versions supporting compression - // dictionaries. See the test case CacheCompressionDict in the - // DBBlockCacheTest suite. 
-#ifdef ZLIB - compression_types.push_back(kZlibCompression); -#endif // ZLIB -#if LZ4_VERSION_NUMBER >= 10400 - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 -#if ZSTD_VERSION_NUMBER >= 500 - compression_types.push_back(kZSTD); -#endif // ZSTD_VERSION_NUMBER >= 500 - - for (int level: {-1, 0, 1, 10}) { - for (auto index_type : - {BlockBasedTableOptions::IndexType::kBinarySearch, - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) { - for (bool block_based_filter : {true, false}) { - for (bool partition_filter : {true, false}) { - if (partition_filter && - (block_based_filter || - index_type != - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { - continue; - } - for (bool index_and_filter_in_cache : {true, false}) { - for (bool pin_l0 : {true, false}) { - for (bool pin_top_level : {true, false}) { - if (pin_l0 && !index_and_filter_in_cache) { - continue; - } - - for (auto compression_type : compression_types) { - for (uint32_t max_dict_bytes : {0, 1 << 14}) { - if (compression_type == kNoCompression && max_dict_bytes) - continue; - - // Create a table - Options opt; - std::unique_ptr ikc; - ikc.reset(new test::PlainInternalKeyComparator( - opt.comparator)); - opt.compression = compression_type; - opt.compression_opts.max_dict_bytes = max_dict_bytes; - BlockBasedTableOptions table_options = - GetBlockBasedTableOptions(); - table_options.block_size = 1024; - table_options.index_type = index_type; - table_options.pin_l0_filter_and_index_blocks_in_cache = - pin_l0; - table_options.pin_top_level_index_and_filter = - pin_top_level; - table_options.partition_filters = partition_filter; - table_options.cache_index_and_filter_blocks = - index_and_filter_in_cache; - // big enough so we don't ever lose cached values. 
- table_options.block_cache = std::make_shared( - 16 * 1024 * 1024, 4, false, 0.0); - table_options.filter_policy.reset( - rocksdb::NewBloomFilterPolicy(10, block_based_filter)); - opt.table_factory.reset(NewBlockBasedTableFactory( - table_options)); - - bool convert_to_internal_key = false; - TableConstructor c(BytewiseComparator(), - convert_to_internal_key, level); - std::string user_key = "k01"; - std::string key = - InternalKey(user_key, 0, kTypeValue).Encode().ToString(); - c.Add(key, "hello"); - std::vector keys; - stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); - const MutableCFOptions moptions(opt); - c.Finish(opt, ioptions, moptions, table_options, *ikc, - &keys, &kvmap); - - // Doing a read to make index/filter loaded into the cache - auto table_reader = - dynamic_cast(c.GetTableReader()); - PinnableSlice value; - GetContext get_context(opt.comparator, nullptr, nullptr, - nullptr, GetContext::kNotFound, user_key, &value, - nullptr, nullptr, nullptr, nullptr); - InternalKey ikey(user_key, 0, kTypeValue); - auto s = table_reader->Get(ReadOptions(), key, &get_context, - moptions.prefix_extractor.get()); - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); - - // Close the table - c.ResetTableReader(); - - auto usage = table_options.block_cache->GetUsage(); - auto pinned_usage = - table_options.block_cache->GetPinnedUsage(); - // The only usage must be for marked data blocks - ASSERT_EQ(usage, MockCache::marked_size_); - // There must be some pinned data since PinnableSlice has - // not released them yet - ASSERT_GT(pinned_usage, 0); - // Release pinnable slice reousrces - value.Reset(); - pinned_usage = table_options.block_cache->GetPinnedUsage(); - ASSERT_EQ(pinned_usage, 0); - } - } - } - } - } - } - } - } - } // level -} - TEST_P(BlockBasedTableTest, BlockCacheLeak) { // Check that when we reopen a table we don't lose access to blocks already // in the cache. This test checks whether the Table actually makes use of the diff --git a/util/compression.h b/util/compression.h index aa8af74499b..5dbb6c244aa 100644 --- a/util/compression.h +++ b/util/compression.h @@ -21,6 +21,7 @@ #include #include "memory/memory_allocator.h" +#include "rocksdb/cleanable.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "util/coding.h" @@ -216,36 +217,60 @@ struct CompressionDict { // Holds dictionary and related data, like ZSTD's digested uncompression // dictionary. -struct UncompressionDict { +struct UncompressionDict : public Cleanable { + // Block containing the data for the compression dictionary. It is non-empty + // only if the constructor that takes a string parameter is used. + std::string dict_; + + // Slice pointing to the compression dictionary data. Points to + // dict_ if the string constructor is used. In the case of the Slice + // constructor, it is a copy of the Slice passed by the caller. + Slice slice_; + #ifdef ROCKSDB_ZSTD_DDICT - ZSTD_DDict* zstd_ddict_; + // Processed version of the contents of slice_ for ZSTD compression. + ZSTD_DDict* zstd_ddict_ = nullptr; #endif // ROCKSDB_ZSTD_DDICT - // Block containing the data for the compression dictionary. It may be - // redundant with the data held in `zstd_ddict_`. - std::string dict_; - // This `Statistics` pointer is intended to be used upon block cache eviction, - // so only needs to be populated on `UncompressionDict`s that'll be inserted - // into block cache. 
- Statistics* statistics_; + // Slice constructor: it is the caller's responsibility to either + // a) make sure slice remains valid throughout the lifecycle of this object OR + // b) transfer the management of the underlying resource (e.g. cache handle) + // to this object, in which case UncompressionDict is self-contained, and the + // resource is guaranteed to be released (via the cleanup logic in Cleanable) + // when UncompressionDict is destroyed. #ifdef ROCKSDB_ZSTD_DDICT - UncompressionDict(std::string dict, bool using_zstd, - Statistics* _statistics = nullptr) { + UncompressionDict(Slice slice, bool using_zstd) #else // ROCKSDB_ZSTD_DDICT - UncompressionDict(std::string dict, bool /*using_zstd*/, - Statistics* _statistics = nullptr) { + UncompressionDict(Slice slice, bool /*using_zstd*/) #endif // ROCKSDB_ZSTD_DDICT - dict_ = std::move(dict); - statistics_ = _statistics; + : slice_(std::move(slice)) { #ifdef ROCKSDB_ZSTD_DDICT - zstd_ddict_ = nullptr; - if (!dict_.empty() && using_zstd) { - zstd_ddict_ = ZSTD_createDDict_byReference(dict_.data(), dict_.size()); + if (!slice_.empty() && using_zstd) { + zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size()); assert(zstd_ddict_ != nullptr); } #endif // ROCKSDB_ZSTD_DDICT } + // String constructor: results in a self-contained UncompressionDict. + UncompressionDict(std::string dict, bool using_zstd) + : UncompressionDict(Slice(dict), using_zstd) { + dict_ = std::move(dict); + } + + UncompressionDict(UncompressionDict&& rhs) + : dict_(std::move(rhs.dict_)), + slice_(std::move(rhs.slice_)) +#ifdef ROCKSDB_ZSTD_DDICT + , + zstd_ddict_(rhs.zstd_ddict_) +#endif + { +#ifdef ROCKSDB_ZSTD_DDICT + rhs.zstd_ddict_ = nullptr; +#endif + } + ~UncompressionDict() { #ifdef ROCKSDB_ZSTD_DDICT size_t res = 0; @@ -257,20 +282,34 @@ struct UncompressionDict { #endif // ROCKSDB_ZSTD_DDICT } + UncompressionDict& operator=(UncompressionDict&& rhs) { + if (this == &rhs) { + return *this; + } + + dict_ = std::move(rhs.dict_); + slice_ = std::move(rhs.slice_); + +#ifdef ROCKSDB_ZSTD_DDICT + zstd_ddict_ = rhs.zstd_ddict_; + rhs.zstd_ddict_ = nullptr; +#endif + + return *this; + } + + const Slice& GetRawDict() const { return slice_; } + #ifdef ROCKSDB_ZSTD_DDICT const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; } #endif // ROCKSDB_ZSTD_DDICT - Slice GetRawDict() const { return dict_; } - static const UncompressionDict& GetEmptyDict() { static UncompressionDict empty_dict{}; return empty_dict; } - Statistics* statistics() const { return statistics_; } - - size_t ApproximateMemoryUsage() { + size_t ApproximateMemoryUsage() const { size_t usage = 0; usage += sizeof(struct UncompressionDict); #ifdef ROCKSDB_ZSTD_DDICT @@ -281,11 +320,9 @@ struct UncompressionDict { } UncompressionDict() = default; - // Disable copy/move + // Disable copy UncompressionDict(const CompressionDict&) = delete; UncompressionDict& operator=(const CompressionDict&) = delete; - UncompressionDict(CompressionDict&&) = delete; - UncompressionDict& operator=(CompressionDict&&) = delete; }; class CompressionContext { @@ -725,7 +762,7 @@ inline CacheAllocationPtr Zlib_Uncompress( return nullptr; } - Slice compression_dict = info.dict().GetRawDict(); + const Slice& compression_dict = info.dict().GetRawDict(); if (compression_dict.size()) { // Initialize the compression library's dictionary st = inflateSetDictionary( @@ -1040,7 +1077,7 @@ inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info, auto output = AllocateBlock(output_len, allocator); #if 
LZ4_VERSION_NUMBER >= 10400 // r124+ LZ4_streamDecode_t* stream = LZ4_createStreamDecode(); - Slice compression_dict = info.dict().GetRawDict(); + const Slice& compression_dict = info.dict().GetRawDict(); if (compression_dict.size()) { LZ4_setStreamDecode(stream, compression_dict.data(), static_cast(compression_dict.size())); From cfcf045accbc5d682a02f4acb1192a7f54f05f1f Mon Sep 17 00:00:00 2001 From: Mark Rambacher Date: Tue, 23 Jul 2019 17:08:26 -0700 Subject: [PATCH 249/572] =?UTF-8?q?The=20ObjectRegistry=20class=20replaces?= =?UTF-8?q?=20the=20Registrar=20and=20NewCustomObjects.=E2=80=A6=20(#5293)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The ObjectRegistry class replaces the Registrar and NewCustomObjects. Objects are registered with the registry by Type (the class must implement the static const char *Type() method). This change is necessary for a few reasons: - By having a class (rather than static template instances), the class can be passed between compilation units, meaning that objects could be registered and shared from a dynamic library with an executable. - By having a class with instances, different units could have different objects registered. This could be useful if, for example, one Option allowed for a dynamic library and one did not. When combined with some other PRs (being able to load shared libraries, a Configurable interface to configure objects to/from string), this code will allow objects in external shared libraries to be added to a RocksDB image at run-time, rather than requiring every new extension to be built into the main library and called explicitly by every program. Test plan (on riversand963's devserver) ``` $COMPILE_WITH_ASAN=1 make -j32 all && sleep 1 && make check ``` All tests pass. 
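To make the interface concrete, here is a hedged sketch of registering and loading a custom `Env` through the new classes, based on the `object_registry.h` interface included in this patch. The `memenv://` pattern and the `MemEnvFactory` name are illustrative, not anything this patch registers, and `NewMemEnv()` is assumed to be available from `rocksdb/env.h` in non-LITE builds:

```
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/utilities/object_registry.h"

// Matches FactoryFunc<Env>. A factory returning a statically owned Env leaves
// the guard empty, signalling that the caller does not take ownership.
static rocksdb::Env* MemEnvFactory(const std::string& /*uri*/,
                                   std::unique_ptr<rocksdb::Env>* /*guard*/,
                                   std::string* /*errmsg*/) {
  static rocksdb::Env* mem_env = rocksdb::NewMemEnv(rocksdb::Env::Default());
  return mem_env;
}

rocksdb::Env* LoadMemEnv() {
  // Entries are keyed by type (Env::Type()) and matched by regex pattern.
  rocksdb::ObjectLibrary::Default()->Register<rocksdb::Env>("memenv://.*",
                                                            MemEnvFactory);
  std::unique_ptr<rocksdb::Env> guard;
  std::string errmsg;
  rocksdb::Env* env =
      rocksdb::ObjectRegistry::NewInstance()->NewObject<rocksdb::Env>(
          "memenv://test", &guard, &errmsg);
  return env;  // nullptr, with errmsg populated, if no entry matched
}
```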
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5293 Differential Revision: D16363396 Pulled By: riversand963 fbshipit-source-id: fbe4acb615bfc11103eef40a0b288845791c0180
--- CMakeLists.txt | 3 +- HISTORY.md | 1 + TARGETS | 26 ++- env/env.cc | 15 ++ env/env_basic_test.cc | 4 +- include/rocksdb/comparator.h | 1 + include/rocksdb/env.h | 5 + include/rocksdb/merge_operator.h | 1 + include/rocksdb/statistics.h | 2 +- include/rocksdb/utilities/object_registry.h | 225 +++++++++++++++----- options/options_helper.cc | 26 +-- options/options_test.cc | 33 +-- src.mk | 1 + tools/block_cache_trace_analyzer.cc | 2 +- tools/db_bench_tool.cc | 17 +- tools/ldb_cmd.cc | 11 +- utilities/object_registry.cc | 87 ++++++++ utilities/object_registry_test.cc | 137 ++++++++++-- 18 files changed, 465 insertions(+), 132 deletions(-) create mode 100644 utilities/object_registry.cc
diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bd7311498f..086975f3e8f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -520,7 +520,7 @@ set(SOURCES db/flush_job.cc db/flush_scheduler.cc db/forward_iterator.cc - db/import_column_family_job.cc + db/import_column_family_job.cc db/internal_stats.cc db/logs_with_prep_tracker.cc db/log_reader.cc @@ -681,6 +681,7 @@ set(SOURCES utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc + utilities/object_registry.cc utilities/option_change_migration/option_change_migration.cc utilities/options/options_util.cc utilities/persistent_cache/block_cache_tier.cc
diff --git a/HISTORY.md b/HISTORY.md index d452a68a30f..59205341020 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,6 +20,7 @@ * Overload GetAllKeyVersions() to support non-default column family. * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. +* Replaces the old Registrar with ObjectRegistry to allow users to create custom objects from a string; also adds LoadEnv() to Env. ### New Features * Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
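The `LoadEnv()` mentioned in the change-log entry above is a thin wrapper over the registry (see the `env/env.cc` hunk below). A hedged usage sketch follows; the `custom://` URI is illustrative, and since the implementation shown below reads `*result` before overwriting it, the pointer is initialized up front:

```
#include <string>

#include "rocksdb/env.h"

rocksdb::Status PickEnv(const std::string& uri, rocksdb::Env** out) {
  rocksdb::Env* env = rocksdb::Env::Default();  // starting value, read by LoadEnv
  rocksdb::Status s = rocksdb::Env::LoadEnv(uri, &env);
  if (s.ok()) {
    *out = env;  // the Env whose registered pattern matched `uri`
  }
  return s;  // NotSupported in LITE builds; non-OK if no registered Env matched
}
```

For example, a tool could call `PickEnv("custom://test", &options.env)` to select an externally registered environment without linking it in explicitly.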
diff --git a/TARGETS b/TARGETS index 122da8b542f..ba6f96c0b5f 100644 --- a/TARGETS +++ b/TARGETS @@ -276,6 +276,7 @@ cpp_library( "utilities/merge_operators/string_append/stringappend.cc", "utilities/merge_operators/string_append/stringappend2.cc", "utilities/merge_operators/uint64add.cc", + "utilities/object_registry.cc", "utilities/option_change_migration/option_change_migration.cc", "utilities/options/options_util.cc", "utilities/persistent_cache/block_cache_tier.cc", @@ -371,11 +372,6 @@ ROCKS_TESTS = [ "logging/auto_roll_logger_test.cc", "serial", ], - [ - "env_logger_test", - "logging/env_logger_test.cc", - "serial", - ], [ "autovector_test", "util/autovector_test.cc", @@ -422,13 +418,13 @@ ROCKS_TESTS = [ "serial", ], [ - "cache_test", - "cache/cache_test.cc", + "cache_simulator_test", + "utilities/simulator_cache/cache_simulator_test.cc", "serial", ], [ - "cache_simulator_test", - "utilities/simulator_cache/cache_simulator_test.cc", + "cache_test", + "cache/cache_test.cc", "serial", ], [ @@ -554,7 +550,7 @@ ROCKS_TESTS = [ [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", - "parallel", + "serial", ], [ "db_compaction_filter_test", @@ -711,6 +707,11 @@ ROCKS_TESTS = [ "env/env_basic_test.cc", "serial", ], + [ + "env_logger_test", + "logging/env_logger_test.cc", + "serial", + ], [ "env_test", "env/env_test.cc", @@ -796,6 +797,11 @@ ROCKS_TESTS = [ "monitoring/histogram_test.cc", "serial", ], + [ + "import_column_family_test", + "db/import_column_family_test.cc", + "parallel", + ], [ "inlineskiplist_test", "memtable/inlineskiplist_test.cc", diff --git a/env/env.cc b/env/env.cc index 87b6b35c16c..4c222cfc19e 100644 --- a/env/env.cc +++ b/env/env.cc @@ -16,6 +16,7 @@ #include "port/port.h" #include "port/sys_time.h" #include "rocksdb/options.h" +#include "rocksdb/utilities/object_registry.h" #include "util/autovector.h" namespace rocksdb { @@ -28,6 +29,20 @@ Status Env::NewLogger(const std::string& fname, return NewEnvLogger(fname, this, result); } +Status Env::LoadEnv(const std::string& value, Env** result) { + Env* env = *result; + Status s; +#ifndef ROCKSDB_LITE + s = ObjectRegistry::NewInstance()->NewStaticObject(value, &env); +#else + s = Status::NotSupported("Cannot load environment in LITE mode: ", value); +#endif + if (s.ok()) { + *result = env; + } + return s; +} + std::string Env::PriorityToString(Env::Priority priority) { switch (priority) { case Env::Priority::BOTTOM: diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index f306edbd6ba..c955bdb7141 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -11,7 +11,6 @@ #include "env/mock_env.h" #include "rocksdb/env.h" -#include "rocksdb/utilities/object_registry.h" #include "test_util/testharness.h" namespace rocksdb { @@ -104,13 +103,12 @@ namespace { // ValuesIn() will skip running tests when given an empty collection. std::vector GetCustomEnvs() { static Env* custom_env; - static std::unique_ptr custom_env_guard; static bool init = false; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - custom_env = NewCustomObject(uri, &custom_env_guard); + Env::LoadEnv(uri, &custom_env); } } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 9f262367d11..e30a9d01459 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -35,6 +35,7 @@ class Comparator { virtual ~Comparator() {} + static const char* Type() { return "Comparator"; } // Three-way comparison. 
Returns value: // < 0 iff "a" < "b", // == 0 iff "a" == "b", diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 126f25747ff..398a7ff511d 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -144,6 +144,11 @@ class Env { virtual ~Env(); + static const char* Type() { return "Environment"; } + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index d8ddcc6a097..36f47e254ed 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -46,6 +46,7 @@ class Logger; class MergeOperator { public: virtual ~MergeOperator() {} + static const char* Type() { return "MergeOperator"; } // Gives the client a way to express the read -> modify -> write semantics // key: (IN) The key that's associated with this merge operation. diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 653b460cbdd..a8d01e03415 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -480,7 +480,7 @@ enum StatsLevel : uint8_t { class Statistics { public: virtual ~Statistics() {} - + static const char* Type() { return "Statistics"; } virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 86a51b92ead..d1516079a61 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -11,80 +11,195 @@ #include #include #include +#include #include - -#include "rocksdb/env.h" +#include "rocksdb/status.h" namespace rocksdb { - -// Creates a new T using the factory function that was registered with a pattern -// that matches the provided "target" string according to std::regex_match. -// -// If no registered functions match, returns nullptr. If multiple functions -// match, the factory function used is unspecified. -// -// Populates res_guard with result pointer if caller is granted ownership. -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard); - +class Logger; // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template -using FactoryFunc = std::function*)>; - -// To register a factory function for a type T, initialize a Registrar object -// with static storage duration. For example: -// -// static Registrar hdfs_reg("hdfs://.*", &CreateHdfsEnv); -// -// Then, calling NewCustomObject("hdfs://some_path", ...) will match the -// regex provided above, so it returns the result of invoking CreateHdfsEnv. -template -class Registrar { +using FactoryFunc = + std::function*, std::string*)>; + +class ObjectLibrary { public: - explicit Registrar(std::string pattern, FactoryFunc factory); -}; + // Base class for an Entry in the Registry. 
+ class Entry { + public: + virtual ~Entry() {} + Entry(const std::string& name) : name_(std::move(name)) {} + + // Checks to see if the target matches this entry + virtual bool matches(const std::string& target) const { + return name_ == target; + } + const std::string& Name() const { return name_; } + + private: + const std::string name_; // The name of the Entry + }; // End class Entry + + // An Entry containing a FactoryFunc for creating new Objects + template + class FactoryEntry : public Entry { + public: + FactoryEntry(const std::string& name, FactoryFunc f) + : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} + ~FactoryEntry() override {} + bool matches(const std::string& target) const override { + return std::regex_match(target, pattern_); + } + // Creates a new T object. + T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } -// Implementation details follow. + private: + std::regex pattern_; // The pattern for this entry + FactoryFunc factory_; + }; // End class FactoryEntry + public: + // Finds the entry matching the input name and type + const Entry* FindEntry(const std::string& type, + const std::string& name) const; + void Dump(Logger* logger) const; + + // Registers the factory with the library for the pattern. + // If the pattern matches, the factory may be used to create a new object. + template + const FactoryFunc& Register(const std::string& pattern, + const FactoryFunc& factory) { + std::unique_ptr entry(new FactoryEntry(pattern, factory)); + AddEntry(T::Type(), entry); + return factory; + } + // Returns the default ObjectLibrary + static std::shared_ptr& Default(); -namespace internal { + private: + // Adds the input entry to the list for the given type + void AddEntry(const std::string& type, std::unique_ptr& entry); -template -struct RegistryEntry { - std::regex pattern; - FactoryFunc factory; + // ** FactoryFunctions for this loader, organized by type + std::unordered_map>> entries_; }; -template -struct Registry { - static Registry* Get() { - static Registry instance; - return &instance; +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. +class ObjectRegistry { + public: + static std::shared_ptr NewInstance(); + + ObjectRegistry(); + + void AddLibrary(const std::shared_ptr& library) { + libraries_.emplace_back(library); } - std::vector> entries; - private: - Registry() = default; -}; + // Creates a new T using the factory function that was registered with a + // pattern that matches the provided "target" string according to + // std::regex_match. + // + // If no registered functions match, returns nullptr. If multiple functions + // match, the factory function used is unspecified. + // + // Populates res_guard with result pointer if caller is granted ownership. + template + T* NewObject(const std::string& target, std::unique_ptr* guard, + std::string* errmsg) { + guard->reset(); + const auto* basic = FindEntry(T::Type(), target); + if (basic != nullptr) { + const auto* factory = + static_cast*>(basic); + return factory->NewFactoryObject(target, guard, errmsg); + } else { + *errmsg = std::string("Could not load ") + T::Type(); + return nullptr; + } + } + + // Creates a new unique T using the input factory functions. 
+ // Returns OK if a new unique T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template + Status NewUniqueObject(const std::string& target, + std::unique_ptr* result) { + std::string errmsg; + T* ptr = NewObject(target, result, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (*result) { + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } -} // namespace internal + // Creates a new shared T using the input factory functions. + // Returns OK if a new shared T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template + Status NewSharedObject(const std::string& target, + std::shared_ptr* result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard) { - res_guard->reset(); - for (const auto& entry : internal::Registry::Get()->entries) { - if (std::regex_match(target, entry.pattern)) { - return entry.factory(target, res_guard); + // Creates a new static T using the input factory functions. + // Returns OK if a new static T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template + Status NewStaticObject(const std::string& target, T** result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); } } - return nullptr; -} -template -Registrar::Registrar(std::string pattern, FactoryFunc factory) { - internal::Registry::Get()->entries.emplace_back(internal::RegistryEntry{ - std::regex(std::move(pattern)), std::move(factory)}); -} + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + private: + const ObjectLibrary::Entry* FindEntry(const std::string& type, + const std::string& name) const; + // The set of libraries to search for factories for this registry. + // The libraries are searched in reverse order (back to front) when + // searching for entries. + std::vector> libraries_; +}; } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/options/options_helper.cc b/options/options_helper.cc index 922ece3a81a..5733ceed455 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -1045,21 +1045,21 @@ Status ParseColumnFamilyOption(const std::string& name, } else { if (name == kNameComparator) { // Try to get comparator from object registry first. 
- std::unique_ptr comp_guard; - const Comparator* comp = - NewCustomObject(value, &comp_guard); // Only support static comparator for now. - if (comp != nullptr && !comp_guard) { - new_options->comparator = comp; + Status status = ObjectRegistry::NewInstance()->NewStaticObject( + value, &new_options->comparator); + if (status.ok()) { + return status; } } else if (name == kNameMergeOperator) { // Try to get merge operator from object registry first. - std::unique_ptr> mo_guard; - std::shared_ptr* mo = - NewCustomObject>(value, &mo_guard); + std::shared_ptr mo; + Status status = + ObjectRegistry::NewInstance()->NewSharedObject( + value, &new_options->merge_operator); // Only support static comparator for now. - if (mo != nullptr) { - new_options->merge_operator = *mo; + if (status.ok()) { + return status; } } @@ -1191,10 +1191,10 @@ Status ParseDBOption(const std::string& name, NewGenericRateLimiter(static_cast(ParseUint64(value)))); } else if (name == kNameEnv) { // Currently `Env` can be deserialized from object registry only. - std::unique_ptr env_guard; - Env* env = NewCustomObject(value, &env_guard); + Env* env = new_options->env; + Status status = Env::LoadEnv(value, &env); // Only support static env for now. - if (env != nullptr && !env_guard) { + if (status.ok()) { new_options->env = env; } } else { diff --git a/options/options_test.cc b/options/options_test.cc index 823a9c1e054..05ea766f6a6 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -341,11 +341,11 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // Comparator from object registry std::string kCompName = "reverse_comp"; - static Registrar test_reg_a( - kCompName, [](const std::string& /*name*/, - std::unique_ptr* /*comparator_guard*/) { - return ReverseBytewiseComparator(); - }); + ObjectLibrary::Default()->Register( + kCompName, + [](const std::string& /*name*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); @@ -354,13 +354,12 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - static Registrar> test_reg_b( - kMoName, [](const std::string& /*name*/, - std::unique_ptr>* - merge_operator_guard) { - merge_operator_guard->reset( - new std::shared_ptr(new BytesXOROperator())); - return merge_operator_guard->get(); + ObjectLibrary::Default()->Register( + kMoName, + [](const std::string& /*name*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new BytesXOROperator()); + return guard->get(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -770,9 +769,10 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} }; - static Registrar test_reg_env( + ObjectLibrary::Default()->Register( kCustomEnvName, - [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/) { + [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { static CustomEnv env(Env::Default()); return &env; }); @@ -813,8 +813,9 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.create_if_missing, true); ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); - std::unique_ptr env_guard; - ASSERT_EQ(NewCustomObject(kCustomEnvName, &env_guard), new_options.env); + Env* newEnv 
= new_options.env; + ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_EQ(newEnv, new_options.env); } TEST_F(OptionsTest, DBOptionsSerialization) { diff --git a/src.mk b/src.mk index 0f04fc73916..3462a6a58bb 100644 --- a/src.mk +++ b/src.mk @@ -195,6 +195,7 @@ LIB_SOURCES = \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ utilities/merge_operators/bytesxor.cc \ + utilities/object_registry.cc \ utilities/option_change_migration/option_change_migration.cc \ utilities/options/options_util.cc \ utilities/persistent_cache/block_cache_tier.cc \ diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 08143ebcf88..761395a6654 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -1637,7 +1637,7 @@ void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only, } fprintf(stdout, "Bottom %" PRIu32 " access count. Access count=%" PRIu64 - " nblocks=%" PRIu64 " %s\n", + " nblocks=%" ROCKSDB_PRIszt " %s\n", bottom_k, naccess_it->first, naccess_it->second.size(), statistics.c_str()); } diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 39f9eebc7e0..f6a9d945897 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -3049,8 +3049,9 @@ class Benchmark { std::shared_ptr timestamp_emulator_; std::unique_ptr secondary_update_thread_; std::atomic secondary_update_stopped_{0}; +#ifndef ROCKSDB_LITE uint64_t secondary_db_updates_ = 0; - +#endif // ROCKSDB_LITE struct ThreadArg { Benchmark* bm; SharedState* shared; @@ -6366,13 +6367,12 @@ int db_bench_tool(int argc, char** argv) { exit(1); } if (!FLAGS_statistics_string.empty()) { - std::unique_ptr custom_stats_guard; - dbstats.reset(NewCustomObject(FLAGS_statistics_string, - &custom_stats_guard)); - custom_stats_guard.release(); + Status s = ObjectRegistry::NewInstance()->NewSharedObject( + FLAGS_statistics_string, &dbstats); if (dbstats == nullptr) { - fprintf(stderr, "No Statistics registered matching string: %s\n", - FLAGS_statistics_string.c_str()); + fprintf(stderr, + "No Statistics registered matching string: %s status=%s\n", + FLAGS_statistics_string.c_str(), s.ToString().c_str()); exit(1); } } @@ -6400,12 +6400,11 @@ int db_bench_tool(int argc, char** argv) { StringToCompressionType(FLAGS_compression_type.c_str()); #ifndef ROCKSDB_LITE - std::unique_ptr custom_env_guard; if (!FLAGS_hdfs.empty() && !FLAGS_env_uri.empty()) { fprintf(stderr, "Cannot provide both --hdfs and --env_uri.\n"); exit(1); } else if (!FLAGS_env_uri.empty()) { - FLAGS_env = NewCustomObject(FLAGS_env_uri, &custom_env_guard); + Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env); if (FLAGS_env == nullptr) { fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str()); exit(1); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 338f09fb992..86dfcc54e9e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -20,7 +20,6 @@ #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/debug.h" -#include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_util.h" #include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" @@ -2854,8 +2853,9 @@ void BackupCommand::DoCommand() { return; } printf("open db OK\n"); - std::unique_ptr custom_env_guard; - Env* custom_env = NewCustomObject(backup_env_uri_, &custom_env_guard); + Env* custom_env = nullptr; + Env::LoadEnv(backup_env_uri_, &custom_env); + 
BackupableDBOptions backup_options = BackupableDBOptions(backup_dir_, custom_env); backup_options.info_log = logger_.get(); @@ -2889,8 +2889,9 @@ void RestoreCommand::Help(std::string& ret) { } void RestoreCommand::DoCommand() { - std::unique_ptr custom_env_guard; - Env* custom_env = NewCustomObject(backup_env_uri_, &custom_env_guard); + Env* custom_env = nullptr; + Env::LoadEnv(backup_env_uri_, &custom_env); + std::unique_ptr restore_engine; Status status; { diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc new file mode 100644 index 00000000000..3706e791e00 --- /dev/null +++ b/utilities/object_registry.cc @@ -0,0 +1,87 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/utilities/object_registry.h" + +#include "logging/logging.h" +#include "rocksdb/env.h" + +namespace rocksdb { +#ifndef ROCKSDB_LITE +// Looks through the "type" factories for one that matches "name". +// If found, returns the pointer to the Entry matching this name. +// Otherwise, nullptr is returned +const ObjectLibrary::Entry *ObjectLibrary::FindEntry( + const std::string &type, const std::string &name) const { + auto entries = entries_.find(type); + if (entries != entries_.end()) { + for (const auto &entry : entries->second) { + if (entry->matches(name)) { + return entry.get(); + } + } + } + return nullptr; +} + +void ObjectLibrary::AddEntry(const std::string &type, + std::unique_ptr &entry) { + auto &entries = entries_[type]; + entries.emplace_back(std::move(entry)); +} + +void ObjectLibrary::Dump(Logger *logger) const { + for (const auto &iter : entries_) { + ROCKS_LOG_HEADER(logger, " Registered factories for type[%s] ", + iter.first.c_str()); + bool printed_one = false; + for (const auto &e : iter.second) { + ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', + e->Name().c_str()); + printed_one = true; + } + } + ROCKS_LOG_HEADER(logger, "\n"); +} + +// Returns the Default singleton instance of the ObjectLibrary +// This instance will contain most of the "standard" registered objects +std::shared_ptr &ObjectLibrary::Default() { + static std::shared_ptr instance = + std::make_shared(); + return instance; +} + +std::shared_ptr ObjectRegistry::NewInstance() { + std::shared_ptr instance = std::make_shared(); + return instance; +} + +ObjectRegistry::ObjectRegistry() { + libraries_.push_back(ObjectLibrary::Default()); +} + +// Searches (from back to front) the libraries looking for the +// an entry that matches this pattern. 
+// Returns the entry if it is found, and nullptr otherwise +const ObjectLibrary::Entry *ObjectRegistry::FindEntry( + const std::string &type, const std::string &name) const { + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) { + const auto *entry = iter->get()->FindEntry(type, name); + if (entry != nullptr) { + return entry; + } + } + return nullptr; +} + +void ObjectRegistry::Dump(Logger *logger) const { + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) { + iter->get()->Dump(logger); + } +} + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc index cc7c38d8a65..826931845dc 100644 --- a/utilities/object_registry_test.cc +++ b/utilities/object_registry_test.cc @@ -17,44 +17,145 @@ class EnvRegistryTest : public testing::Test { int EnvRegistryTest::num_a = 0; int EnvRegistryTest::num_b = 0; +static FactoryFunc test_reg_a = ObjectLibrary::Default()->Register( + "a://.*", + [](const std::string& /*uri*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + ++EnvRegistryTest::num_a; + return Env::Default(); + }); -static Registrar test_reg_a("a://.*", - [](const std::string& /*uri*/, - std::unique_ptr* /*env_guard*/) { - ++EnvRegistryTest::num_a; - return Env::Default(); - }); - -static Registrar test_reg_b("b://.*", [](const std::string& /*uri*/, - std::unique_ptr* env_guard) { - ++EnvRegistryTest::num_b; - // Env::Default() is a singleton so we can't grant ownership directly to the - // caller - we must wrap it first. - env_guard->reset(new EnvWrapper(Env::Default())); - return env_guard->get(); -}); +static FactoryFunc test_reg_b = ObjectLibrary::Default()->Register( + "b://.*", [](const std::string& /*uri*/, std::unique_ptr* env_guard, + std::string* /* errmsg */) { + ++EnvRegistryTest::num_b; + // Env::Default() is a singleton so we can't grant ownership directly to + // the caller - we must wrap it first. 
+ env_guard->reset(new EnvWrapper(Env::Default())); + return env_guard->get(); + }); TEST_F(EnvRegistryTest, Basics) { + std::string msg; std::unique_ptr env_guard; - auto res = NewCustomObject("a://test", &env_guard); + auto registry = ObjectRegistry::NewInstance(); + auto res = registry->NewObject("a://test", &env_guard, &msg); ASSERT_NE(res, nullptr); ASSERT_EQ(env_guard, nullptr); ASSERT_EQ(1, num_a); ASSERT_EQ(0, num_b); - res = NewCustomObject("b://test", &env_guard); + res = registry->NewObject("b://test", &env_guard, &msg); ASSERT_NE(res, nullptr); ASSERT_NE(env_guard, nullptr); ASSERT_EQ(1, num_a); ASSERT_EQ(1, num_b); - res = NewCustomObject("c://test", &env_guard); + res = registry->NewObject("c://test", &env_guard, &msg); ASSERT_EQ(res, nullptr); ASSERT_EQ(env_guard, nullptr); ASSERT_EQ(1, num_a); ASSERT_EQ(1, num_b); } +TEST_F(EnvRegistryTest, LocalRegistry) { + std::string msg; + std::unique_ptr guard; + auto registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "test-local", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + ObjectLibrary::Default()->Register( + "test-global", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + ASSERT_EQ( + ObjectRegistry::NewInstance()->NewObject("test-local", &guard, &msg), + nullptr); + ASSERT_NE( + ObjectRegistry::NewInstance()->NewObject("test-global", &guard, &msg), + nullptr); + ASSERT_NE(registry->NewObject("test-local", &guard, &msg), nullptr); + ASSERT_NE(registry->NewObject("test-global", &guard, &msg), nullptr); +} + +TEST_F(EnvRegistryTest, CheckShared) { + std::shared_ptr shared; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + library->Register( + "guarded", [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EnvWrapper(Env::Default())); + return guard->get(); + }); + + ASSERT_OK(registry->NewSharedObject("guarded", &shared)); + ASSERT_NE(shared, nullptr); + shared.reset(); + ASSERT_NOK(registry->NewSharedObject("unguarded", &shared)); + ASSERT_EQ(shared, nullptr); +} + +TEST_F(EnvRegistryTest, CheckStatic) { + Env* env = nullptr; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + library->Register( + "guarded", [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EnvWrapper(Env::Default())); + return guard->get(); + }); + + ASSERT_NOK(registry->NewStaticObject("guarded", &env)); + ASSERT_EQ(env, nullptr); + env = nullptr; + ASSERT_OK(registry->NewStaticObject("unguarded", &env)); + ASSERT_NE(env, nullptr); +} + +TEST_F(EnvRegistryTest, CheckUnique) { + std::unique_ptr unique; + std::shared_ptr registry = ObjectRegistry::NewInstance(); + std::shared_ptr library = std::make_shared(); + registry->AddLibrary(library); + library->Register( + "unguarded", + [](const std::string& /*uri*/, std::unique_ptr* 
/*guard */, + std::string* /* errmsg */) { return Env::Default(); }); + + library->Register( + "guarded", [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EnvWrapper(Env::Default())); + return guard->get(); + }); + + ASSERT_OK(registry->NewUniqueObject("guarded", &unique)); + ASSERT_NE(unique, nullptr); + unique.reset(); + ASSERT_NOK(registry->NewUniqueObject("unguarded", &unique)); + ASSERT_EQ(unique, nullptr); +} + } // namespace rocksdb int main(int argc, char** argv) { From f5b951f7b6b223f0373bec2d935a0a3a68c17d32 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 23 Jul 2019 19:34:56 -0700 Subject: [PATCH 250/572] Fix wrong info log printing for num_range_deletions (#5617) Summary: num_range_deletions printing is wrong in this log line: 2019/07/18-12:59:15.309271 7f869f9ff700 EVENT_LOG_v1 {"time_micros": 1563479955309228, "cf_name": "5", "job": 955, "event": "table_file_creation", "file_number": 34579, "file_size": 2239842, "table_properties": {"data_size": 1988792, "index_size": 3067, "index_partitions": 0, "top_level_index_size": 0, "index_key_is_user_key": 0, "index_value_is_delta_encoded": 1, "filter_size": 170821, "raw_key_size": 1951792, "raw_average_key_size": 16, "raw_value_size": 1731720, "raw_average_value_size": 14, "num_data_blocks": 199, "num_entries": 121987, "num_deletions": 15184, "num_merge_operands": 86512, "num_range_deletions": 86512, "format_version": 0, "fixed_key_len": 0, "filter_policy": "rocksdb.BuiltinBloomFilter", "column_family_name": "5", "column_family_id": 5, "comparator": "leveldb.BytewiseComparator", "merge_operator": "PutOperator", "prefix_extractor_name": "rocksdb.FixedPrefix.7", "property_collectors": "[]", "compression": "ZSTD", "compression_options": "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; ", "creation_time": 1563479951, "oldest_key_time": 0, "file_creation_time": 1563479954}} It actually prints the "num_merge_operands" value instead. Fix it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5617 Test Plan: Just build. Differential Revision: D16453110 fbshipit-source-id: fc1024b3cd5650312ed47a1379f0d2cf8b2d8a8f --- db/event_helpers.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/event_helpers.cc b/db/event_helpers.cc index f1b4b6417ed..4c38ad31400 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -106,7 +106,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "num_entries" << table_properties.num_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << table_properties.num_merge_operands - << "num_range_deletions" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions << "format_version" << table_properties.format_version << "fixed_key_len" << table_properties.fixed_key_len << "filter_policy" << table_properties.filter_policy_name From 66b524a9112bfe12d8e43cfb69e5ab7a65c9a950 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Wed, 24 Jul 2019 10:21:18 -0700 Subject: [PATCH 251/572] Simplify WriteUnpreparedTxnReadCallback and fix some comments (#5621) Summary: Simplify WriteUnpreparedTxnReadCallback so we just have one function `CalcMaxVisibleSeq`. Also, there's no need for the read callback to hold onto the transaction any more, so just hold the set of unprep_seqs, reducing the amount of indirection in `IsVisibleFullCheck`. Also, some comments about using the transaction snapshot were out of date, so remove them.
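The heart of this change is how the callback's visibility bound is computed. A standalone restatement of the `CalcMaxVisibleSeq` helper from the diff below, with the map type written out under the assumption that `unprep_seqs` maps prepare_seq to prepare_batch_cnt (as the surrounding comments state):
```
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>

using SequenceNumber = uint64_t;

// The read callback must not filter out the transaction's own unprepared
// writes, so the effective snapshot is extended to cover the largest
// unprepared sequence number, i.e. prepare_seq + prepare_batch_cnt - 1 of
// the last entry in the map.
SequenceNumber CalcMaxVisibleSeq(
    const std::map<SequenceNumber, size_t>& unprep_seqs,
    SequenceNumber snapshot_seq) {
  SequenceNumber max_unprepared = 0;
  if (!unprep_seqs.empty()) {
    max_unprepared =
        unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
  }
  return std::max(max_unprepared, snapshot_seq);
}
```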
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5621 Differential Revision: D16459883 Pulled By: lth fbshipit-source-id: cd581323fd18982e817d99af57b6eaba59e599bb --- .../transactions/write_unprepared_txn.cc | 15 ++------- utilities/transactions/write_unprepared_txn.h | 31 ++++++++++--------- .../transactions/write_unprepared_txn_db.cc | 30 ++++++++---------- 3 files changed, 32 insertions(+), 44 deletions(-) diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 4d1401b3aa1..9265c3d4afb 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -13,15 +13,13 @@ namespace rocksdb { bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { - auto unprep_seqs = txn_->GetUnpreparedSequenceNumbers(); - // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is // in unprep_seqs, we have to check if seq is equal to prep_seq or any of // the prepare_batch_cnt seq nums after it. // // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is // large. - for (const auto& it : unprep_seqs) { + for (const auto& it : unprep_seqs_) { if (it.first <= seq && seq < it.first + it.second) { return true; } @@ -30,15 +28,6 @@ bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { return db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_); } -SequenceNumber WriteUnpreparedTxnReadCallback::CalcMaxUnpreparedSequenceNumber( - WriteUnpreparedTxn* txn) { - const auto& unprep_seqs = txn->GetUnpreparedSequenceNumbers(); - if (unprep_seqs.size()) { - return unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; - } - return 0; -} - WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, const WriteOptions& write_options, const TransactionOptions& txn_options) @@ -537,7 +526,7 @@ Status WriteUnpreparedTxn::Get(const ReadOptions& options, const bool backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - this); + unprep_seqs_); auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, value, &callback); if (LIKELY(wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index b64fd81e611..d81c30217df 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -53,17 +53,17 @@ class WriteUnpreparedTxn; // class WriteUnpreparedTxnReadCallback : public ReadCallback { public: - WriteUnpreparedTxnReadCallback(WritePreparedTxnDB* db, - SequenceNumber snapshot, - SequenceNumber min_uncommitted, - WriteUnpreparedTxn* txn) + WriteUnpreparedTxnReadCallback( + WritePreparedTxnDB* db, SequenceNumber snapshot, + SequenceNumber min_uncommitted, + const std::map& unprep_seqs) // Pass our last uncommitted seq as the snapshot to the parent class to // ensure that the parent will not prematurely filter out own writes. We // will do the exact comparison against snapshots in IsVisibleFullCheck // override. 
- : ReadCallback(CalcMaxVisibleSeq(txn, snapshot), min_uncommitted), + : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted), db_(db), - txn_(txn), + unprep_seqs_(unprep_seqs), wup_snapshot_(snapshot) {} virtual bool IsVisibleFullCheck(SequenceNumber seq) override; @@ -74,15 +74,18 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { } private: - static SequenceNumber CalcMaxVisibleSeq(WriteUnpreparedTxn* txn, - SequenceNumber snapshot_seq) { - SequenceNumber max_unprepared = CalcMaxUnpreparedSequenceNumber(txn); + static SequenceNumber CalcMaxVisibleSeq( + const std::map& unprep_seqs, + SequenceNumber snapshot_seq) { + SequenceNumber max_unprepared = 0; + if (unprep_seqs.size()) { + max_unprepared = + unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1; + } return std::max(max_unprepared, snapshot_seq); } - static SequenceNumber CalcMaxUnpreparedSequenceNumber( - WriteUnpreparedTxn* txn); WritePreparedTxnDB* db_; - WriteUnpreparedTxn* txn_; + const std::map& unprep_seqs_; SequenceNumber wup_snapshot_; }; @@ -124,8 +127,6 @@ class WriteUnpreparedTxn : public WritePreparedTxn { virtual Status RebuildFromWriteBatch(WriteBatch*) override; - const std::map& GetUnpreparedSequenceNumbers(); - protected: void Initialize(const TransactionOptions& txn_options) override; @@ -156,6 +157,8 @@ class WriteUnpreparedTxn : public WritePreparedTxn { friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test; friend class WriteUnpreparedTxnDB; + const std::map& GetUnpreparedSequenceNumbers(); + Status MaybeFlushWriteBatchToDB(); Status FlushWriteBatchToDB(bool prepared); Status HandleWrite(std::function do_write); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index c3fcd1f45d2..875d5416763 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -348,7 +348,8 @@ struct WriteUnpreparedTxnDB::IteratorState { IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, std::shared_ptr s, SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn) - : callback(txn_db, sequence, min_uncommitted, txn), snapshot(s) {} + : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_), + snapshot(s) {} SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); } WriteUnpreparedTxnReadCallback callback; @@ -384,27 +385,22 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, // foo: v5 5 // // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3, - // which is the last visible key. + // which is the last visible value. // // For unprepared transactions, if we have snap_seq = 3, but the current - // transaction has unprep_seq 5, then returning the first non-visible key + // transaction has unprep_seq 5, then returning the first non-visible value // would be incorrect, as we should return v5, and not v3. The problem is that - // there are committed keys at snapshot_seq < commit_seq < unprep_seq. + // there are committed values at snapshot_seq < commit_seq < unprep_seq. // // Snapshot validation can prevent this problem by ensuring that no committed - // keys exist at snapshot_seq < commit_seq, and thus any value with a sequence - // number greater than snapshot_seq must be unprepared keys. For example, if - // the transaction had a snapshot at 3, then snapshot validation would be - // performed during the Put(v5) call. 
It would find v4, and the Put would fail - // with snapshot validation failure. - // - // Because of this, if any writes have occurred, then the transaction snapshot - // must be used for the iterator. If no writes have occurred though, we can - // simply create a snapshot. Later writes would not be visible though, but we - // don't support iterating while writing anyway. + // values exist at snapshot_seq < commit_seq, and thus any value with a + // sequence number greater than snapshot_seq must be unprepared values. For + // example, if the transaction had a snapshot at 3, then snapshot validation + // would be performed during the Put(v5) call. It would find v4, and the Put + // would fail with snapshot validation failure. // // TODO(lth): Improve Prev() logic to continue iterating until - // max_visible_seq, and then return the last visible key, so that this + // max_visible_seq, and then return the last visible value, so that this // restriction can be lifted. const Snapshot* snapshot = nullptr; if (options.snapshot == nullptr) { @@ -418,9 +414,9 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, assert(snapshot_seq != kMaxSequenceNumber); // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are // guaranteed that for keys that were modified by this transaction (and thus - // might have unprepared versions), no committed versions exist at + // might have unprepared values), no committed values exist at // largest_validated_seq < commit_seq (or the contrapositive: any committed - // version must exist at commit_seq <= largest_validated_seq). This implies + // value must exist at commit_seq <= largest_validated_seq). This implies // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <= // snapshot_seq. As explained above, the problem with Prev() only happens when // snapshot_seq < commit_seq. From 5daa426a18bf5349584154b51a5404f2b1b69d1a Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 24 Jul 2019 12:04:58 -0700 Subject: [PATCH 252/572] Fix regression bug of Auto rolling logger when handling failures (#5622) Summary: The auto roll logger fails to handle file creation errors correctly, which may expose users to a segfault. Fix it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5622 Test Plan: Add a unit test that creates a log file under a non-existent directory. The test fails without the fix.
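From the caller's perspective, the contract after this fix is: on a failed log file creation, `CreateLoggerFromOptions` returns a non-OK status and leaves the logger null. A minimal defensive-usage sketch, modeled on the `FileCreateFailure` test in the diff below (the include path assumes the internal `logging/auto_roll_logger.h` header, which declares `CreateLoggerFromOptions`):
```
#include <cassert>
#include <memory>

#include "logging/auto_roll_logger.h"  // assumed to declare CreateLoggerFromOptions
#include "rocksdb/options.h"

void OpenLoggerSafely() {
  rocksdb::Options options;
  options.db_log_dir = "/a/dir/does/not/exist/at/all";  // forces a failure

  std::shared_ptr<rocksdb::Logger> logger;
  rocksdb::Status s = rocksdb::CreateLoggerFromOptions("", options, &logger);
  if (!s.ok()) {
    assert(!logger);  // on failure the logger stays null; do not use it
  }
}
```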
Differential Revision: D16460853 fbshipit-source-id: e96da4bef4f16db171ea04a11b2ec5a9448ddbde --- logging/auto_roll_logger.cc | 5 ++--- logging/auto_roll_logger_test.cc | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index ec240f5a334..223dfbe303c 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -46,9 +46,8 @@ AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, } GetExistingFiles(); ResetLogger(); - s = TrimOldLogFiles(); - if (!status_.ok()) { - status_ = s; + if (status_.ok()) { + status_ = TrimOldLogFiles(); } } diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index fa668114cfb..dd279d62a25 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -635,6 +635,15 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) { delete db; } +TEST_F(AutoRollLoggerTest, FileCreateFailure) { + Options options; + options.max_log_file_size = 100 * 1024 * 1024; + options.db_log_dir = "/a/dir/does/not/exist/at/all"; + + std::shared_ptr logger; + ASSERT_NOK(CreateLoggerFromOptions("", options, &logger)); + ASSERT_TRUE(!logger); +} } // namespace rocksdb int main(int argc, char** argv) { From 7260347fd1af7d6f631bd4263368c8fd2a3bbbf2 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 24 Jul 2019 15:11:36 -0700 Subject: [PATCH 253/572] Auto Roll Logger to add some extra checking to avoid segfault. (#5623) Summary: AutoRollLogger sets GetStatus() to be non-OK if the log file fails to be created and logger_ is set to null. It is left to the caller to check the status before calling functions of this class. There is no harm in adding another null check on logger_ before using it, so that users who misuse the logger do not get a segfault. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5623 Test Plan: Run all existing tests. Differential Revision: D16466251 fbshipit-source-id: 262b885eec28bf741d91e9191c3cb5ff964e1bce --- logging/auto_roll_logger.cc | 14 +++++++++++++- logging/auto_roll_logger.h | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index 223dfbe303c..3109f0bc69c 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -155,6 +155,11 @@ std::string AutoRollLogger::ValistToString(const char* format, void AutoRollLogger::LogInternal(const char* format, ...) { mutex_.AssertHeld(); + + if (!logger_) { + return; + } + va_list args; va_start(args, format); logger_->Logv(format, args); @@ -163,7 +168,10 @@ void AutoRollLogger::LogInternal(const char* format, ...) { void AutoRollLogger::Logv(const char* format, va_list ap) { assert(GetStatus().ok()); - + if (!logger_) { + return; + } + std::shared_ptr logger; { MutexLock l(&mutex_); @@ -207,6 +215,10 @@ void AutoRollLogger::WriteHeaderInfo() { } void AutoRollLogger::LogHeader(const char* format, va_list args) { + if (!logger_) { + return; + } + // header message are to be retained in memory.
Since we cannot make any // assumptions about the data contained in va_list, we will retain them as // strings diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h index a14fbfd5892..45cbc2697a1 100644 --- a/logging/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -41,6 +41,10 @@ class AutoRollLogger : public Logger { } size_t GetLogFileSize() const override { + if (!logger_) { + return 0; + } + std::shared_ptr logger; { MutexLock l(&mutex_); From d9dc6b4637276740a19ff8f649fc0d634342e960 Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Wed, 24 Jul 2019 15:17:55 -0700 Subject: [PATCH 254/572] Declare snapshot refresh incompatible with delete range (#5625) Summary: The ::snap_refresh_nanos option is incompatible with the DeleteRange feature. Currently the code relies on range_del_agg.IsEmpty() to disable it if there are range delete tombstones. However ::IsEmpty does not guarantee that there are no RangeDelete tombstones in the SST files. The patch declares the two features incompatible in inline comments until we later figure out how to properly detect the presence of RangeDelete tombstones in compaction inputs. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5625 Differential Revision: D16468218 Pulled By: maysamyabandeh fbshipit-source-id: bd7beca278bc7e1db75e7ee4522d05a3a6ca86f4 --- include/rocksdb/options.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 35c27556553..896beba23fc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -275,6 +275,8 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // this option helps reducing the cpu usage of long-running compactions. The // feature is disabled when max_subcompactions is greater than one. // + // NOTE: This feature is currently incompatible with RangeDeletes. + // // Default: 0 // // Dynamically changeable through SetOptions() API From 0d16fad51b5b8ad41ccc70faab11599f7120b093 Mon Sep 17 00:00:00 2001 From: Chad Austin Date: Thu, 25 Jul 2019 11:42:31 -0700 Subject: [PATCH 255/572] rocksdb: build on macosx Summary: Make rocksdb build on macos:
1) Reorganize OS-specific flags and deps in rocksdb/src/TARGETS
2) Sandbox fbcode apple platform builds from repo root include path (which conflicts with layout of rocksdb headers).
3) Fix dep-translation for bzip2.
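The practical effect of the tuple lists in the TARGETS diff below is that Buck attaches each flag and dependency set only on the named platform; C++ sources then branch on the injected defines. A hypothetical consumption sketch (this function is invented for illustration, not RocksDB code):
```
// OS_LINUX / OS_MACOSX are the defines injected via
// ROCKSDB_OS_PREPROCESSOR_FLAGS in the TARGETS change below.
#include <string>

std::string BuildPlatform() {
#if defined(OS_LINUX)
  return "linux";    // Linux builds also get -DNUMA, -DROCKSDB_FALLOCATE_PRESENT, ...
#elif defined(OS_MACOSX)
  return "macos";    // apple platform builds only get -DOS_MACOSX here
#else
  return "unknown";
#endif
}
```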
Reviewed By: andrewjcg Differential Revision: D15125826 fbshipit-source-id: 8e143c689b88b5727e54881a5e80500f879a320b --- TARGETS | 86 ++++++++++++++++++++++++++++++++++++++++---------------- defs.bzl | 6 +++- 2 files changed, 67 insertions(+), 25 deletions(-) diff --git a/TARGETS b/TARGETS index ba6f96c0b5f..c0c6fd97fad 100644 --- a/TARGETS +++ b/TARGETS @@ -6,29 +6,9 @@ REPO_PATH = package_name() + "/" ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] @@ -41,11 +21,54 @@ ROCKSDB_EXTERNAL_DEPS = [ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -53,7 +76,6 @@ ROCKSDB_PREPROCESSOR_FLAGS = [ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -70,9 +92,15 @@ sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. 
-ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) cpp_library( name = "rocksdb_lib", @@ -308,6 +336,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -328,6 +358,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -344,6 +376,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -355,6 +389,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_test_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -1113,6 +1149,8 @@ ROCKS_TESTS = [ rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, diff --git a/defs.bzl b/defs.bzl index f3e8339783e..a9f25ebcc42 100644 --- a/defs.bzl +++ b/defs.bzl @@ -8,9 +8,11 @@ def test_binary( test_cc, parallelism, rocksdb_arch_preprocessor_flags, + rocksdb_os_preprocessor_flags, rocksdb_compiler_flags, rocksdb_preprocessor_flags, - rocksdb_external_deps): + rocksdb_external_deps, + rocksdb_os_deps): TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh" ttype = "gtest" if parallelism == "parallel" else "simple" @@ -20,9 +22,11 @@ def test_binary( name = test_bin, srcs = [test_cc], arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, + os_preprocessor_flags = rocksdb_os_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, deps = [":rocksdb_test_lib"], + os_deps = rocksdb_os_deps, external_deps = rocksdb_external_deps, ) From ae152ee666c34b31c4bb0fa5a8fdf46a6b5ea93b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 25 Jul 2019 15:23:46 -0700 Subject: [PATCH 256/572] Avoid user key copying for Get/Put/Write with user-timestamp (#5502) Summary: In previous https://github.com/facebook/rocksdb/issues/5079, we added user-specified timestamp to `DB::Get()` and `DB::Put()`. 
The limitation is that these two functions may cause extra memory allocation and key copying. The reason is that `WriteBatch` does not allocate extra memory for timestamps because it is not aware of the timestamp size, and we did not provide an API to assign/update the timestamp of each key within a `WriteBatch`. We address these issues in this PR by doing the following.
1. Add a `timestamp_size_` to `WriteBatch` so that `WriteBatch` can take timestamps into account when calling `WriteBatch::Put`, `WriteBatch::Delete`, etc.
2. Add APIs `WriteBatch::AssignTimestamp` and `WriteBatch::AssignTimestamps` so that applications can assign/update timestamps for each key in a `WriteBatch`.
3. Avoid key copying in `GetImpl` by adding a new constructor to `LookupKey`.
Test plan (on devserver):
```
$make clean && COMPILE_WITH_ASAN=1 make -j32 all
$./db_basic_test --gtest_filter=Timestamp/DBBasicTestWithTimestampWithParam.PutAndGet/*
$make check
```
If the API extension looks good, I will add more unit tests. Some simple benchmarks using db_bench.
```
$rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillseq,readrandom -num=1000000
$rm -rf /dev/shm/dbbench/* && TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=1000000 -disable_wal=true
```
Master is at a78503bd6c80a3c4137df1962a972fe406b4d90b.
```
|        | readrandom | fillrandom |
| master | 15.53 MB/s | 25.97 MB/s |
| PR5502 | 16.70 MB/s | 25.80 MB/s |
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5502 Differential Revision: D16340894 Pulled By: riversand963 fbshipit-source-id: 51132cf792be07d1efc3ac33f5768c4ee2608bb8 --- .gitignore | 1 + db/db_impl/db_impl.cc | 13 +-- db/db_impl/db_impl_write.cc | 12 +-- db/dbformat.cc | 12 ++- db/dbformat.h | 16 ---- db/lookup_key.h | 3 +- db/write_batch.cc | 146 ++++++++++++++++++++++++++++++++-- include/rocksdb/write_batch.h | 9 +++ util/coding.h | 16 +++- 9 files changed, 183 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index 7a799c09a9d..c8672a8b31e 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ rocksdb_undump db_test2 trace_analyzer trace_analyzer_test +block_cache_trace_analyzer .DS_Store java/out diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 8132d5a0b38..54e401ddd5a 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1441,16 +1441,7 @@ ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - if (nullptr == read_options.timestamp) { - return GetImpl(read_options, column_family, key, value); - } - Slice akey; - std::string buf; - Status s = AppendTimestamp(key, *(read_options.timestamp), &akey, &buf); - if (s.ok()) { - s = GetImpl(read_options, column_family, akey, value); - } - return s; + return GetImpl(read_options, column_family, key, value); } Status DBImpl::GetImpl(const ReadOptions& read_options, @@ -1528,7 +1519,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case.
- LookupKey lkey(key, snapshot); + LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); bool skip_memtable = (read_options.read_tier == kPersistedTier && diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 95a1b31c769..0ad2a3e9a86 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1734,14 +1734,16 @@ Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, } return Write(opt, &batch); } - Slice akey; - std::string buf; - Status s = AppendTimestamp(key, *(opt.timestamp), &akey, &buf); + const Slice* ts = opt.timestamp; + assert(nullptr != ts); + size_t ts_sz = ts->size(); + WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0, + ts_sz); + Status s = batch.Put(column_family, key, value); if (!s.ok()) { return s; } - WriteBatch batch(akey.size() + value.size() + 24); - s = batch.Put(column_family, akey, value); + s = batch.AssignTimestamp(*ts); if (!s.ok()) { return s; } diff --git a/db/dbformat.cc b/db/dbformat.cc index bfaea868b53..130ba4e8adf 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -159,9 +159,11 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s, + const Slice* ts) { size_t usize = _user_key.size(); - size_t needed = usize + 13; // A conservative estimate + size_t ts_sz = (nullptr == ts) ? 0 : ts->size(); + size_t needed = usize + ts_sz + 13; // A conservative estimate char* dst; if (needed <= sizeof(space_)) { dst = space_; @@ -170,10 +172,14 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { } start_ = dst; // NOTE: We don't support users keys of more than 2GB :) - dst = EncodeVarint32(dst, static_cast(usize + 8)); + dst = EncodeVarint32(dst, static_cast(usize + ts_sz + 8)); kstart_ = dst; memcpy(dst, _user_key.data(), usize); dst += usize; + if (nullptr != ts) { + memcpy(dst, ts->data(), ts_sz); + dst += ts_sz; + } EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); dst += 8; end_ = dst; diff --git a/db/dbformat.h b/db/dbformat.h index c6ee5677c09..1d9b7ef7e3f 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -669,20 +669,4 @@ struct ParsedInternalKeyComparator { const InternalKeyComparator* cmp; }; -// TODO (yanqin): this causes extra memory allocation and copy. Should be -// addressed in the future. -inline Status AppendTimestamp(const Slice& key, const Slice& timestamp, - Slice* ret_key, std::string* ret_buf) { - assert(ret_key != nullptr); - assert(ret_buf != nullptr); - if (key.data() + key.size() == timestamp.data()) { - *ret_key = Slice(key.data(), key.size() + timestamp.size()); - } else { - ret_buf->assign(key.data(), key.size()); - ret_buf->append(timestamp.data(), timestamp.size()); - *ret_key = Slice(*ret_buf); - } - return Status::OK(); -} - } // namespace rocksdb diff --git a/db/lookup_key.h b/db/lookup_key.h index ddf4ff0e942..1b0f6f56290 100644 --- a/db/lookup_key.h +++ b/db/lookup_key.h @@ -21,7 +21,8 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. 
-  LookupKey(const Slice& _user_key, SequenceNumber sequence);
+  LookupKey(const Slice& _user_key, SequenceNumber sequence,
+            const Slice* ts = nullptr);

   ~LookupKey();
diff --git a/db/write_batch.cc b/db/write_batch.cc
index d7a2e792a33..2c2d81e87f6 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -135,6 +135,105 @@ struct BatchContentClassifier : public WriteBatch::Handler {
   }
 };

+class TimestampAssigner : public WriteBatch::Handler {
+ public:
+  explicit TimestampAssigner(const Slice& ts)
+      : timestamp_(ts), timestamps_(kEmptyTimestampList) {}
+  explicit TimestampAssigner(const std::vector<Slice>& ts_list)
+      : timestamps_(ts_list) {
+    SanityCheck();
+  }
+  ~TimestampAssigner() override {}
+
+  Status PutCF(uint32_t, const Slice& key, const Slice&) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t, const Slice& key) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t, const Slice& key) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status DeleteRangeCF(uint32_t, const Slice& begin_key,
+                       const Slice& end_key) override {
+    AssignTimestamp(begin_key);
+    AssignTimestamp(end_key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+    // TODO (yanqin): support blob db in the future.
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+ private:
+  void SanityCheck() const {
+    assert(!timestamps_.empty());
+#ifndef NDEBUG
+    const size_t ts_sz = timestamps_[0].size();
+    for (size_t i = 1; i != timestamps_.size(); ++i) {
+      assert(ts_sz == timestamps_[i].size());
+    }
+#endif  // !NDEBUG
+  }
+
+  void AssignTimestamp(const Slice& key) {
+    assert(timestamps_.empty() || idx_ < timestamps_.size());
+    const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
+    size_t ts_sz = ts.size();
+    char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
+    memcpy(ptr, ts.data(), ts_sz);
+  }
+
+  static const std::vector<Slice> kEmptyTimestampList;
+  const Slice timestamp_;
+  const std::vector<Slice>& timestamps_;
+  size_t idx_ = 0;
+
+  // No copy or move.
+  TimestampAssigner(const TimestampAssigner&) = delete;
+  TimestampAssigner(TimestampAssigner&&) = delete;
+  TimestampAssigner& operator=(const TimestampAssigner&) = delete;
+  TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
+};
+const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
+
 }  // anon namespace

 struct SavePoints {
@@ -142,7 +241,15 @@ struct SavePoints {
 };

 WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
-    : content_flags_(0), max_bytes_(max_bytes), rep_() {
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
+  rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+                   ? reserved_bytes
+                   : WriteBatchInternal::kHeader);
+  rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
   rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
                    ? reserved_bytes
                    : WriteBatchInternal::kHeader);
   rep_.resize(WriteBatchInternal::kHeader);
@@ -151,18 +258,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
 WriteBatch::WriteBatch(const std::string& rep)
     : content_flags_(ContentFlags::DEFERRED),
       max_bytes_(0),
-      rep_(rep) {}
+      rep_(rep),
+      timestamp_size_(0) {}

 WriteBatch::WriteBatch(std::string&& rep)
     : content_flags_(ContentFlags::DEFERRED),
       max_bytes_(0),
-      rep_(std::move(rep)) {}
+      rep_(std::move(rep)),
+      timestamp_size_(0) {}

 WriteBatch::WriteBatch(const WriteBatch& src)
     : wal_term_point_(src.wal_term_point_),
       content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
       max_bytes_(src.max_bytes_),
-      rep_(src.rep_) {
+      rep_(src.rep_),
+      timestamp_size_(src.timestamp_size_) {
   if (src.save_points_ != nullptr) {
     save_points_.reset(new SavePoints());
     save_points_->stack = src.save_points_->stack;
@@ -174,7 +284,8 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept
       wal_term_point_(std::move(src.wal_term_point_)),
       content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
       max_bytes_(src.max_bytes_),
-      rep_(std::move(src.rep_)) {}
+      rep_(std::move(src.rep_)),
+      timestamp_size_(src.timestamp_size_) {}

 WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
   if (&src != this) {
@@ -643,7 +754,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
     b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
     PutVarint32(&b->rep_, column_family_id);
   }
-  PutLengthPrefixedSlice(&b->rep_, key);
+  if (0 == b->timestamp_size_) {
+    PutLengthPrefixedSlice(&b->rep_, key);
+  } else {
+    PutVarint32(&b->rep_,
+                static_cast<uint32_t>(key.size() + b->timestamp_size_));
+    b->rep_.append(key.data(), key.size());
+    b->rep_.append(b->timestamp_size_, '\0');
+  }
   PutLengthPrefixedSlice(&b->rep_, value);
   b->content_flags_.store(
       b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
@@ -692,7 +810,11 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
     b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
     PutVarint32(&b->rep_, column_family_id);
   }
-  PutLengthPrefixedSliceParts(&b->rep_, key);
+  if (0 == b->timestamp_size_) {
+    PutLengthPrefixedSliceParts(&b->rep_, key);
+  } else {
+    PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_);
+  }
   PutLengthPrefixedSliceParts(&b->rep_, value);
   b->content_flags_.store(
       b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
@@ -1038,6 +1160,16 @@ Status WriteBatch::PopSavePoint() {
   return Status::OK();
 }

+Status WriteBatch::AssignTimestamp(const Slice& ts) {
+  TimestampAssigner ts_assigner(ts);
+  return Iterate(&ts_assigner);
+}
+
+Status WriteBatch::AssignTimestamps(const std::vector<Slice>& ts_list) {
+  TimestampAssigner ts_assigner(ts_list);
+  return Iterate(&ts_assigner);
+}
+
 class MemTableInserter : public WriteBatch::Handler {

   SequenceNumber sequence_;
diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h
index 29b660d1987..393c5d9c6ab 100644
--- a/include/rocksdb/write_batch.h
+++ b/include/rocksdb/write_batch.h
@@ -28,6 +28,7 @@
 #include <atomic>
 #include <stack>
 #include <string>
+#include <vector>
 #include "rocksdb/status.h"
 #include "rocksdb/write_batch_base.h"

@@ -60,6 +61,7 @@ struct SavePoint {

 class WriteBatch : public WriteBatchBase {
  public:
   explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0);
+  explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz);
   ~WriteBatch() override;

   using WriteBatchBase::Put;
@@ -311,6 +313,12 @@ class WriteBatch : public WriteBatchBase {
   // Returns true if MarkRollback will be called during Iterate
   bool HasRollback() const;

+  // Assign timestamp to write batch
+  Status AssignTimestamp(const Slice& ts);
+
+  // Assign timestamps to write batch
+  Status AssignTimestamps(const std::vector<Slice>& ts_list);
+
   using WriteBatchBase::GetWriteBatch;
   WriteBatch* GetWriteBatch() override { return this; }
@@ -361,6 +369,7 @@ class WriteBatch : public WriteBatchBase {
 protected:
   std::string rep_;  // See comment in write_batch.cc for the format of rep_
+  const size_t timestamp_size_;

   // Intentionally copyable
 };
diff --git a/util/coding.h b/util/coding.h
index 9427d52618e..3ad6d957007 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -50,6 +50,8 @@ extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1,
 extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
 extern void PutLengthPrefixedSliceParts(std::string* dst,
                                         const SliceParts& slice_parts);
+extern void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz);

 // Standard Get... routines parse a value from the beginning of a Slice
 // and advance the slice past the parsed value.
@@ -306,9 +308,8 @@ inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
   dst->append(value.data(), value.size());
 }

-inline void PutLengthPrefixedSliceParts(std::string* dst,
+inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes,
                                         const SliceParts& slice_parts) {
-  size_t total_bytes = 0;
   for (int i = 0; i < slice_parts.num_parts; ++i) {
     total_bytes += slice_parts.parts[i].size();
   }
@@ -318,6 +319,17 @@ inline void PutLengthPrefixedSliceParts(std::string* dst,
   }
 }

+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/0, slice_parts);
+}
+
+inline void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts);
+  dst->append(pad_sz, '\0');
+}
+
 inline int VarintLength(uint64_t v) {
   int len = 1;
   while (v >= 128) {

From 9625a2bc2b56c92487922c192f4b903083e63c2c Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Thu, 25 Jul 2019 22:38:53 -0700
Subject: [PATCH 257/572] Added SizeApproximationOptions to DB::GetApproximateSizes (#5626)

Summary:
Adds a new DB::GetApproximateSizes overload that takes a SizeApproximationOptions argument, which allows adding more options/knobs to the DB::GetApproximateSizes call (beyond only the include_flags)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5626

Differential Revision: D16496913

Pulled By: elipoz

fbshipit-source-id: ee8c6c182330a285fa056ecfc3905a592b451720
---
 HISTORY.md                               |  2 ++
 db/db_impl/db_impl.cc                    | 17 ++++++++++-------
 db/db_impl/db_impl.h                     |  7 ++++---
 db/db_test.cc                            |  9 +++++----
 include/rocksdb/db.h                     | 24 +++++++++++++++++-------
 include/rocksdb/options.h                | 14 ++++++++++----
 include/rocksdb/utilities/stackable_db.h |  9 +++++----
 7 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 59205341020..ace55cab404 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -21,6 +21,7 @@
 * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469
 * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator.
 * Replaces the old Registrar with ObjectRegistry to allow users to create custom objects from strings; also adds LoadEnv() to Env.
+* Added a new overload of GetApproximateSizes which takes a SizeApproximationOptions object and returns a Status. The older overloads redirect their calls to this new method and no longer assert if include_flags has neither the INCLUDE_MEMTABLES nor the INCLUDE_FILES bit set. It's recommended to use only the new method, as it is more type-safe and returns a meaningful status in case of errors.

 ### New Features
 * Add an option `snap_refresh_nanos` (defaults to 0) to periodically refresh the snapshot list in compaction jobs. Assign 0 to disable the feature.
@@ -29,6 +30,7 @@
 * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error.
 * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This keeps the original DB intact.
 * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
+* Added DBOptions::log_readahead_size, which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), prefetching is disabled.

 ### Performance Improvements
 * Reduce binary search when an iterator reseeks into the same data block.
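A minimal usage sketch of the new overload (not part of the patch; it assumes an open `DB* db`, and the key bounds are made up):

```
SizeApproximationOptions approx_options;
approx_options.include_memtables = true;  // also count unflushed memtable data
approx_options.include_files = true;
Range ranges[1];
ranges[0] = Range("key000", "key999");  // hypothetical bounds
uint64_t sizes[1] = {0};
Status s = db->GetApproximateSizes(approx_options, db->DefaultColumnFamily(),
                                   ranges, 1, sizes);
// Unlike the legacy overload, misuse (both include flags false) now returns
// Status::InvalidArgument instead of asserting.
```

Because the legacy include_flags overloads forward to this method, existing callers keep working unchanged.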
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 54e401ddd5a..16a6d86a658 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2770,11 +2770,13 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
   ReturnAndCleanupSuperVersion(cfd, sv);
 }

-void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                 const Range* range, int n, uint64_t* sizes,
-                                 uint8_t include_flags) {
-  assert(include_flags & DB::SizeApproximationFlags::INCLUDE_FILES ||
-         include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES);
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+                                   ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes) {
+  if (!options.include_memtables && !options.include_files) {
+    return Status::InvalidArgument("Invalid options");
+  }
+
   Version* v;
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
@@ -2786,18 +2788,19 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
     InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
     InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
     sizes[i] = 0;
-    if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) {
+    if (options.include_files) {
       sizes[i] += versions_->ApproximateSize(
           v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1,
           TableReaderCaller::kUserApproximateSize);
     }
-    if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) {
+    if (options.include_memtables) {
       sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
       sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
     }
   }

   ReturnAndCleanupSuperVersion(cfd, sv);
+  return Status::OK();
 }

 std::list<uint64_t>::iterator
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 547e3e1d6be..fe3a2f6f20f 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -233,9 +233,10 @@ class DBImpl : public DB {
   virtual bool GetAggregatedIntProperty(const Slice& property,
                                         uint64_t* aggregated_value) override;
   using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(
-      ColumnFamilyHandle* column_family, const Range* range, int n,
-      uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) override;
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) override;
   using DB::GetApproximateMemTableStats;
   virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
                                            const Range& range,
diff --git a/db/db_test.cc b/db/db_test.cc
index 36bdda59e21..f247ddb80fa 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -2598,13 +2598,14 @@ class ModelDB : public DB {
     return false;
   }
   using DB::GetApproximateSizes;
-  void GetApproximateSizes(ColumnFamilyHandle* /*column_family*/,
-                           const Range* /*range*/, int n, uint64_t* sizes,
-                           uint8_t /*include_flags*/
-                           = INCLUDE_FILES) override {
+  Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+                             ColumnFamilyHandle* /*column_family*/,
+                             const Range* /*range*/, int n,
+                             uint64_t* sizes) override {
     for (int i = 0; i < n; i++) {
       sizes[i] = 0;
     }
+    return Status::OK();
   }
   using DB::GetApproximateMemTableStats;
   void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index d90ca900f45..1d90dc50b4b 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -808,7 +808,7 @@ class DB {
   // stats should be included, or file stats approximation or both
   enum SizeApproximationFlags : uint8_t {
     NONE = 0,
-    INCLUDE_MEMTABLES = 1,
+    INCLUDE_MEMTABLES = 1 << 0,
     INCLUDE_FILES = 1 << 1
   };

@@ -818,14 +818,24 @@ class DB {
   // Note that the returned sizes measure file system space usage, so
   // if the user data compresses by a factor of ten, the returned
   // sizes will be one-tenth the size of the corresponding user data size.
-  //
-  // If include_flags defines whether the returned size should include
-  // the recently written data in the mem-tables (if
-  // the mem-table type supports it), data serialized to disk, or both.
-  // include_flags should be of type DB::SizeApproximationFlags
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) = 0;
+
+  // Simpler versions of the GetApproximateSizes() method above.
+  // The include_flags argument must be of type DB::SizeApproximationFlags
+  // and cannot be NONE.
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
                                    const Range* range, int n, uint64_t* sizes,
-                                   uint8_t include_flags = INCLUDE_FILES) = 0;
+                                   uint8_t include_flags = INCLUDE_FILES) {
+    SizeApproximationOptions options;
+    options.include_memtables =
+        (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
+    options.include_files =
+        (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
+    GetApproximateSizes(options, column_family, range, n, sizes);
+  }
   virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
                                    uint8_t include_flags = INCLUDE_FILES) {
     GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 896beba23fc..5ae010b8f52 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1093,10 +1093,6 @@ struct DBOptions {
   // The number of bytes to prefetch when reading the log. This is mostly useful
   // for reading a remotely located log, as it can save the number of
   // round-trips. If 0, then the prefetching is disabled.
-
-  // If non-zero, we perform bigger reads when reading the log.
-  // This is mostly useful for reading a remotely located log, as it can save
-  // the number of round-trips. If 0, then the prefetching is disabled.
   //
   // Default: 0
   size_t log_readahead_size = 0;
@@ -1510,4 +1506,14 @@ struct ImportColumnFamilyOptions {
   bool move_files = false;
 };

+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+  // Defines whether the returned size should include the recently written
+  // data in the mem-tables. If set to false, include_files must be true.
+  bool include_memtables = false;
+  // Defines whether the returned size should include data serialized to disk.
+  // If set to false, include_memtables must be true.
+  bool include_files = true;
+};
+
 }  // namespace rocksdb
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index a52aff5d8b1..67bf4e2fa6b 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -209,10 +209,11 @@ class StackableDB : public DB {
   }

   using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(
-      ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes,
-      uint8_t include_flags = INCLUDE_FILES) override {
-    return db_->GetApproximateSizes(column_family, r, n, sizes, include_flags);
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* r, int n,
+                                     uint64_t* sizes) override {
+    return db_->GetApproximateSizes(options, column_family, r, n, sizes);
   }

   using DB::GetApproximateMemTableStats;

From 74782cec325e32824699bd4385df2c914d261721 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Fri, 26 Jul 2019 09:52:23 -0700
Subject: [PATCH 258/572] Fix target 'clean' to include parallel test binaries
 (#5629)

Summary:
The current `clean` target in the Makefile does not remove parallel test binaries. Fix this.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5629

Test Plan:
(on devserver)
Take file_reader_writer_test for instance.
```
$make -j32 file_reader_writer_test
$make clean
```
Verify that the binary file 'file_reader_writer_test' is deleted by `make clean`.

Differential Revision: D16513176

Pulled By: riversand963

fbshipit-source-id: 70acb9f56c928a494964121b86aacc0090f31ff6
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 65d884fa4fb..7f5772c6273 100644
--- a/Makefile
+++ b/Makefile
@@ -1068,7 +1068,7 @@ rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc
 	build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc

 clean:
-	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED)
+	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(LIBRARY) $(SHARED)
 	rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
 	$(FIND) . -name "*.[oda]" -exec rm -f {} \;
 	$(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;

From 230b909da8e9a9c807ed559f45c2d9a3dae4aa78 Mon Sep 17 00:00:00 2001
From: Manuel Ung
Date: Fri, 26 Jul 2019 11:31:46 -0700
Subject: [PATCH 259/572] Fix PopSavePoint to merge info into the previous
 savepoint (#5628)

Summary:
Transaction::RollbackToSavePoint undoes the modifications made since the savepoint began, and also unlocks the corresponding keys, which are tracked in the last SavePoint. Currently ::PopSavePoint simply discards these tracked keys, leaving them locked in the lock manager. This breaks a subsequent ::RollbackToSavePoint behavior, as it loses track of such keys and thus cannot unlock them. The patch fixes ::PopSavePoint by passing on the tracked key information to the previous SavePoint.
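For illustration, a minimal sketch of the scenario being fixed (it mirrors the new SavepointTest4 added below; `txn` is an assumed pessimistic transaction):

```
txn->SetSavePoint();         // savepoint 1
txn->Put("A", "a");
txn->SetSavePoint();         // savepoint 2
txn->Put("B", "b");
txn->PopSavePoint();         // folds savepoint 2's tracked keys into 1
txn->RollbackToSavePoint();  // now undoes and unlocks both A and B
```

Before this patch, the PopSavePoint call dropped B's lock-tracking info, so the final rollback left B locked.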
Fixes https://github.com/facebook/rocksdb/issues/5618

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5628

Differential Revision: D16505325

Pulled By: lth

fbshipit-source-id: 2bc3b30963ab4d36d996d1f66543c93abf358980
---
 utilities/transactions/transaction_base.cc | 43 +++++++++++++++++-
 utilities/transactions/transaction_base.h  | 10 +++--
 utilities/transactions/transaction_test.cc | 52 ++++++++++++++++++++++
 utilities/transactions/transaction_util.h  |  8 ++++
 4 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc
index 5621a7fa372..bf59a1c4069 100644
--- a/utilities/transactions/transaction_base.cc
+++ b/utilities/transactions/transaction_base.cc
@@ -192,7 +192,48 @@ Status TransactionBaseImpl::PopSavePoint() {
   }

   assert(!save_points_->empty());
-  save_points_->pop();
+  // If there is another savepoint A below the current savepoint B, then A needs
+  // to inherit tracked_keys in B so that if we rollback to savepoint A, we
+  // remember to unlock keys in B. If there is no other savepoint below, then we
+  // can safely discard savepoint info.
+  if (save_points_->size() == 1) {
+    save_points_->pop();
+  } else {
+    TransactionBaseImpl::SavePoint top;
+    std::swap(top, save_points_->top());
+    save_points_->pop();
+
+    const TransactionKeyMap& curr_cf_key_map = top.new_keys_;
+    TransactionKeyMap& prev_cf_key_map = save_points_->top().new_keys_;
+
+    for (const auto& curr_cf_key_iter : curr_cf_key_map) {
+      uint32_t column_family_id = curr_cf_key_iter.first;
+      const std::unordered_map<std::string, TransactionKeyMapInfo>& curr_keys =
+          curr_cf_key_iter.second;
+
+      // If cfid was not previously tracked, just copy everything over.
+      auto prev_keys_iter = prev_cf_key_map.find(column_family_id);
+      if (prev_keys_iter == prev_cf_key_map.end()) {
+        prev_cf_key_map.emplace(curr_cf_key_iter);
+      } else {
+        std::unordered_map<std::string, TransactionKeyMapInfo>& prev_keys =
+            prev_keys_iter->second;
+        for (const auto& key_iter : curr_keys) {
+          const std::string& key = key_iter.first;
+          const TransactionKeyMapInfo& info = key_iter.second;
+          // If key was not previously tracked, just copy the whole struct over.
+          // Otherwise, some merging needs to occur.
+          auto prev_info = prev_keys.find(key);
+          if (prev_info == prev_keys.end()) {
+            prev_keys.emplace(key_iter);
+          } else {
+            prev_info->second.Merge(info);
+          }
+        }
+      }
+    }
+  }
+
   return write_batch_.PopSavePoint();
 }

diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h
index 26efd51b378..657e9c59656 100644
--- a/utilities/transactions/transaction_base.h
+++ b/utilities/transactions/transaction_base.h
@@ -294,11 +294,11 @@ class TransactionBaseImpl : public Transaction {

   struct SavePoint {
     std::shared_ptr<const Snapshot> snapshot_;
-    bool snapshot_needed_;
+    bool snapshot_needed_ = false;
     std::shared_ptr<TransactionNotifier> snapshot_notifier_;
-    uint64_t num_puts_;
-    uint64_t num_deletes_;
-    uint64_t num_merges_;
+    uint64_t num_puts_ = 0;
+    uint64_t num_deletes_ = 0;
+    uint64_t num_merges_ = 0;

     // Record all keys tracked since the last savepoint
     TransactionKeyMap new_keys_;
@@ -312,6 +312,8 @@ class TransactionBaseImpl : public Transaction {
           num_puts_(num_puts),
           num_deletes_(num_deletes),
           num_merges_(num_merges) {}
+
+    SavePoint() = default;
   };

   // Records writes pending in this transaction
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 7868d0060e9..534103a545e 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -4030,6 +4030,58 @@ TEST_P(TransactionTest, SavepointTest3) {
   ASSERT_TRUE(s.IsNotFound());
 }

+TEST_P(TransactionTest, SavepointTest4) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  txn1->SetSavePoint();  // 1
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Remove 2
+  ASSERT_OK(s);
+
+  // Verify that A/B still exists.
+  std::string value;
+  ASSERT_OK(txn1->Get(read_options, "A", &value));
+  ASSERT_EQ("a", value);
+
+  ASSERT_OK(txn1->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that everything was rolled back.
+  s = txn1->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Nothing should be locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "");
+  ASSERT_OK(s);
+
+  s = txn2->Put("B", "");
+  ASSERT_OK(s);
+
+  delete txn2;
+  delete txn1;
+}
+
 TEST_P(TransactionTest, UndoGetForUpdateTest) {
   WriteOptions write_options;
   ReadOptions read_options;
diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h
index 1d910134b66..b1f9f24cb4e 100644
--- a/utilities/transactions/transaction_util.h
+++ b/utilities/transactions/transaction_util.h
@@ -31,6 +31,14 @@ struct TransactionKeyMapInfo {

   explicit TransactionKeyMapInfo(SequenceNumber seq_no)
       : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {}
+
+  // Used in PopSavePoint to collapse two savepoints together.
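+  // The newer savepoint's read/write counts are accumulated into this entry,
+  // the exclusive flag is OR-ed, and the older `seq` (this entry's, per the
+  // assert below) is kept.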
+ void Merge(const TransactionKeyMapInfo& info) { + assert(seq <= info.seq); + num_reads += info.num_reads; + num_writes += info.num_writes; + exclusive |= info.exclusive; + } }; using TransactionKeyMap = From 3617287e0ec4593587c59909079d40a32209bbe4 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 26 Jul 2019 11:44:32 -0700 Subject: [PATCH 260/572] Parallelize db_bloom_filter_test (#5632) Summary: This test frequently times out under TSAN; parallelizing it should fix this issue. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5632 Test Plan: make check buck test mode/dev-tsan internal_repo_rocksdb/repo:db_bloom_filter_test Differential Revision: D16519399 Pulled By: ltamasi fbshipit-source-id: 66e05a644d6f79c6d544255ffcf6de195d2d62fe --- Makefile | 2 +- TARGETS | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7f5772c6273..7aef277d590 100644 --- a/Makefile +++ b/Makefile @@ -442,7 +442,6 @@ TESTS = \ db_block_cache_test \ db_test \ db_blob_index_test \ - db_bloom_filter_test \ db_iter_test \ db_iter_stress_test \ db_log_iter_test \ @@ -569,6 +568,7 @@ TESTS = \ PARALLEL_TEST = \ backupable_db_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_merge_operator_test \ diff --git a/TARGETS b/TARGETS index c0c6fd97fad..a54e56b9835 100644 --- a/TARGETS +++ b/TARGETS @@ -586,7 +586,7 @@ ROCKS_TESTS = [ [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", - "serial", + "parallel", ], [ "db_compaction_filter_test", From 41df7348308fe74fb92bbfa0e330d863524a381a Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Fri, 26 Jul 2019 12:52:07 -0700 Subject: [PATCH 261/572] WriteUnPrepared: Add new variable write_batch_flush_threshold (#5633) Summary: Instead of reusing `TransactionOptions::max_write_batch_size` for determining when to flush a write batch for write unprepared, add a new variable called `write_batch_flush_threshold` for this use case instead. Also add `TransactionDBOptions::default_write_batch_flush_threshold` which sets the default value if `TransactionOptions::write_batch_flush_threshold` is unspecified. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5633 Differential Revision: D16520364 Pulled By: lth fbshipit-source-id: d75ae5a2141ce7708982d5069dc3f0b58d250e8c --- include/rocksdb/utilities/transaction_db.h | 10 +++++++ utilities/transactions/transaction_test.cc | 12 ++------- .../write_unprepared_transaction_test.cc | 12 ++++----- .../transactions/write_unprepared_txn.cc | 27 +++++++++++-------- utilities/transactions/write_unprepared_txn.h | 4 +-- 5 files changed, 36 insertions(+), 29 deletions(-) diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index db32ba0bc3a..33826bab861 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -101,6 +101,11 @@ struct TransactionDBOptions { // ordering rather than concurrency control. bool skip_concurrency_control = false; + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + ssize_t default_write_batch_flush_threshold = 0; + private: // 128 entries size_t wp_snapshot_cache_bits = static_cast(7); @@ -162,6 +167,11 @@ struct TransactionOptions { // back/commit before new transactions start. 
// Default: false bool skip_concurrency_control = false; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + ssize_t write_batch_flush_threshold = -1; }; // The per-write optimizations that do not involve transactions. TransactionDB diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 534103a545e..98548dd9555 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -5303,16 +5303,8 @@ TEST_P(TransactionTest, MemoryLimitTest) { ASSERT_EQ(2, txn->GetNumPuts()); s = txn->Put(Slice("b"), Slice("....")); - auto pdb = reinterpret_cast(db); - // For write unprepared, write batches exceeding max_write_batch_size will - // just flush to DB instead of returning a memory limit error. - if (pdb->GetTxnDBOptions().write_policy != WRITE_UNPREPARED) { - ASSERT_TRUE(s.IsMemoryLimit()); - ASSERT_EQ(2, txn->GetNumPuts()); - } else { - ASSERT_OK(s); - ASSERT_EQ(3, txn->GetNumPuts()); - } + ASSERT_TRUE(s.IsMemoryLimit()); + ASSERT_EQ(2, txn->GetNumPuts()); txn->Rollback(); delete txn; diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index a2546229e4d..feaedea067f 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -157,7 +157,7 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) { Transaction* txn; TransactionOptions txn_options; // batch_size of 1 causes writes to DB for every marker. - txn_options.max_write_batch_size = 1; + txn_options.write_batch_flush_threshold = 1; ReadOptions read_options; for (uint32_t i = 0; i < kNumIter; i++) { @@ -311,7 +311,7 @@ TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) { // batch_size of 1 causes writes to DB for every marker. for (size_t batch_size : {1, 1000000}) { - txn_options.max_write_batch_size = batch_size; + txn_options.write_batch_flush_threshold = batch_size; for (bool empty : {true, false}) { for (Action a : {UNPREPARED, ROLLBACK, COMMIT}) { for (int num_batches = 1; num_batches < 10; num_batches++) { @@ -332,7 +332,7 @@ TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) { txn->SetName("xid"); for (int i = 0; i < num_batches; i++) { ASSERT_OK(txn->Put("k" + ToString(i), "value" + ToString(i))); - if (txn_options.max_write_batch_size == 1) { + if (txn_options.write_batch_flush_threshold == 1) { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i + 1); } else { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0); @@ -398,7 +398,7 @@ TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) { // batch_size of 1 causes writes to DB for every marker. 
for (size_t batch_size : {1, 1000000}) { - txn_options.max_write_batch_size = batch_size; + txn_options.write_batch_flush_threshold = batch_size; for (bool prepare : {false, true}) { for (bool commit : {false, true}) { ReOpen(); @@ -408,7 +408,7 @@ TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) { for (int i = 0; i < kNumKeys; i++) { txn->Put("k" + ToString(i), "v" + ToString(i)); - if (txn_options.max_write_batch_size == 1) { + if (txn_options.write_batch_flush_threshold == 1) { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i + 1); } else { ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0); @@ -457,7 +457,7 @@ TEST_P(WriteUnpreparedTransactionTest, MarkLogWithPrepSection) { WriteOptions write_options; TransactionOptions txn_options; // batch_size of 1 causes writes to DB for every marker. - txn_options.max_write_batch_size = 1; + txn_options.write_batch_flush_threshold = 1; const int kNumKeys = 10; WriteOptions wopts; diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 9265c3d4afb..c677013aa03 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -35,13 +35,12 @@ WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, wupt_db_(txn_db), recovered_txn_(false), largest_validated_seq_(0) { - max_write_batch_size_ = txn_options.max_write_batch_size; - // We set max bytes to zero so that we don't get a memory limit error. - // Instead of trying to keep write batch strictly under the size limit, we - // just flush to DB when the limit is exceeded in write unprepared, to avoid - // having retry logic. This also allows very big key-value pairs that exceed - // max bytes to succeed. - write_batch_.SetMaxBytes(0); + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } } WriteUnpreparedTxn::~WriteUnpreparedTxn() { @@ -71,8 +70,13 @@ WriteUnpreparedTxn::~WriteUnpreparedTxn() { void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { PessimisticTransaction::Initialize(txn_options); - max_write_batch_size_ = txn_options.max_write_batch_size; - write_batch_.SetMaxBytes(0); + if (txn_options.write_batch_flush_threshold < 0) { + write_batch_flush_threshold_ = + txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold; + } else { + write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold; + } + unprep_seqs_.clear(); recovered_txn_ = false; largest_validated_seq_ = 0; @@ -222,8 +226,9 @@ Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) { Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { const bool kPrepared = true; Status s; - if (max_write_batch_size_ != 0 && - write_batch_.GetDataSize() > max_write_batch_size_) { + if (write_batch_flush_threshold_ > 0 && + write_batch_.GetDataSize() > + static_cast(write_batch_flush_threshold_)) { assert(GetState() != PREPARED); s = FlushWriteBatchToDB(!kPrepared); } diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index d81c30217df..feac749ee82 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -164,10 +164,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { Status HandleWrite(std::function do_write); // For write unprepared, 
we check on every writebatch append to see if - // max_write_batch_size_ has been exceeded, and then call + // write_batch_flush_threshold_ has been exceeded, and then call // FlushWriteBatchToDB if so. This logic is encapsulated in // MaybeFlushWriteBatchToDB. - size_t max_write_batch_size_; + ssize_t write_batch_flush_threshold_; WriteUnpreparedTxnDB* wupt_db_; // Ordered list of unprep_seq sequence numbers that we have already written From 70c7302fb5d343fa319e05327ecf88d09fe26a2b Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Fri, 26 Jul 2019 14:36:16 -0700 Subject: [PATCH 262/572] Block cache simulator: Add pysim to simulate caches using reinforcement learning. (#5610) Summary: This PR implements cache eviction using reinforcement learning. It includes two implementations: 1. An implementation of Thompson Sampling for the Bernoulli Bandit [1]. 2. An implementation of LinUCB with disjoint linear models [2]. The idea is that a cache uses multiple eviction policies, e.g., MRU, LRU, and LFU. The cache learns which eviction policy is the best and uses it upon a cache miss. Thompson Sampling is contextless and does not include any features. LinUCB includes features such as level, block type, caller, column family id to decide which eviction policy to use. [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. Trends Mach. Learn. 11, 1 (July 2018), 1-96. DOI: https://doi.org/10.1561/2200000070 [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. A contextual-bandit approach to personalized news article recommendation. In Proceedings of the 19th international conference on World wide web (WWW '10). ACM, New York, NY, USA, 661-670. DOI=http://dx.doi.org/10.1145/1772690.1772758 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5610 Differential Revision: D16435067 Pulled By: HaoyuHuang fbshipit-source-id: 6549239ae14115c01cb1e70548af9e46d8dc21bb --- .gitignore | 1 + CMakeLists.txt | 4 +- Makefile | 4 +- TARGETS | 6 +- src.mk | 6 +- tools/block_cache_analyzer/__init__.py | 2 + .../block_cache_analyzer/block_cache_pysim.py | 864 ++++++++++++++++++ .../block_cache_analyzer/block_cache_pysim.sh | 118 +++ .../block_cache_pysim_test.py | 340 +++++++ .../block_cache_trace_analyzer.cc | 14 +- .../block_cache_trace_analyzer.h | 0 .../block_cache_trace_analyzer_plot.py | 0 .../block_cache_trace_analyzer_test.cc | 4 +- .../block_cache_trace_analyzer_tool.cc | 2 +- 14 files changed, 1345 insertions(+), 20 deletions(-) create mode 100644 tools/block_cache_analyzer/__init__.py create mode 100644 tools/block_cache_analyzer/block_cache_pysim.py create mode 100644 tools/block_cache_analyzer/block_cache_pysim.sh create mode 100644 tools/block_cache_analyzer/block_cache_pysim_test.py rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer.cc (99%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer.h (100%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer_plot.py (100%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer_test.cc (99%) rename tools/{ => block_cache_analyzer}/block_cache_trace_analyzer_tool.cc (91%) diff --git a/.gitignore b/.gitignore index c8672a8b31e..199458901ec 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ manifest_dump sst_dump blob_dump block_cache_trace_analyzer +tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc build_tools/VALGRIND_LOGS/ diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 086975f3e8f..7266f3b55c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -626,7 +626,7 @@ set(SOURCES test_util/sync_point_impl.cc test_util/testutil.cc test_util/transaction_test_util.cc - tools/block_cache_trace_analyzer.cc + tools/block_cache_analyzer/block_cache_trace_analyzer.cc tools/db_bench_tool.cc tools/dump/db_dump_tool.cc tools/ldb_cmd.cc @@ -976,7 +976,7 @@ if(WITH_TESTS) table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc - tools/block_cache_trace_analyzer_test.cc + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc diff --git a/Makefile b/Makefile index 7aef277d590..fbe6d2d06ff 100644 --- a/Makefile +++ b/Makefile @@ -1114,7 +1114,7 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) -block_cache_trace_analyzer: tools/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) +block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) @@ -1614,7 +1614,7 @@ db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_cache_trace_analyzer_test: tools/block_cache_trace_analyzer_test.o tools/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) +block_cache_trace_analyzer_test: tools/block_cache_analyzer/block_cache_trace_analyzer_test.o tools/block_cache_analyzer/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) #------------------------------------------------- diff --git a/TARGETS b/TARGETS index a54e56b9835..884d69b14bc 100644 --- a/TARGETS +++ b/TARGETS @@ -351,7 +351,7 @@ cpp_library( "test_util/fault_injection_test_env.cc", "test_util/testharness.cc", "test_util/testutil.cc", - "tools/block_cache_trace_analyzer.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/trace_analyzer_tool.cc", "utilities/cassandra/test_utils.cc", ], @@ -369,7 +369,7 @@ cpp_library( name = "rocksdb_tools_lib", srcs = [ "test_util/testutil.cc", - "tools/block_cache_trace_analyzer.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", ], @@ -430,7 +430,7 @@ ROCKS_TESTS = [ ], [ "block_cache_trace_analyzer_test", - "tools/block_cache_trace_analyzer_test.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", "serial", ], [ diff --git a/src.mk b/src.mk index 3462a6a58bb..0c6142e41ad 100644 --- a/src.mk +++ b/src.mk @@ -246,7 +246,7 @@ TOOL_LIB_SOURCES = \ utilities/blob_db/blob_dump_tool.cc \ ANALYZER_LIB_SOURCES = \ - tools/block_cache_trace_analyzer.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer.cc \ tools/trace_analyzer_tool.cc \ MOCK_LIB_SOURCES = \ @@ -374,8 +374,8 @@ MAIN_SOURCES = \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ - tools/block_cache_trace_analyzer_test.cc \ - tools/block_cache_trace_analyzer_tool.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ diff --git a/tools/block_cache_analyzer/__init__.py 
b/tools/block_cache_analyzer/__init__.py
new file mode 100644
index 00000000000..8dbe96a7850
--- /dev/null
+++ b/tools/block_cache_analyzer/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/tools/block_cache_analyzer/block_cache_pysim.py b/tools/block_cache_analyzer/block_cache_pysim.py
new file mode 100644
index 00000000000..63e367be5a7
--- /dev/null
+++ b/tools/block_cache_analyzer/block_cache_pysim.py
@@ -0,0 +1,864 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import gc
+import random
+import sys
+import time
+from os import path
+
+import numpy as np
+
+
+kSampleSize = 16  # The sample size used when performing eviction.
+kMicrosInSecond = 1000000
+kSecondsInMinute = 60
+kSecondsInHour = 3600
+
+
+class TraceRecord:
+    """
+    A trace record represents a block access.
+    It holds the same struct as BlockCacheTraceRecord in
+    trace_replay/block_cache_tracer.h
+    """
+
+    def __init__(
+        self,
+        access_time,
+        block_id,
+        block_type,
+        block_size,
+        cf_id,
+        cf_name,
+        level,
+        fd,
+        caller,
+        no_insert,
+        get_id,
+        key_id,
+        kv_size,
+        is_hit,
+    ):
+        self.access_time = access_time
+        self.block_id = block_id
+        self.block_type = block_type
+        self.block_size = block_size
+        self.cf_id = cf_id
+        self.cf_name = cf_name
+        self.level = level
+        self.fd = fd
+        self.caller = caller
+        if no_insert == 1:
+            self.no_insert = True
+        else:
+            self.no_insert = False
+        self.get_id = get_id
+        self.key_id = key_id
+        self.kv_size = kv_size
+        if is_hit == 1:
+            self.is_hit = True
+        else:
+            self.is_hit = False
+
+
+class CacheEntry:
+    """A cache entry stored in the cache."""
+
+    def __init__(self, value_size, cf_id, level, block_type, access_number):
+        self.value_size = value_size
+        self.last_access_number = access_number
+        self.num_hits = 0
+        self.cf_id = cf_id
+        self.level = level
+        self.block_type = block_type
+
+    def __repr__(self):
+        """Debug string."""
+        return "s={},last={},hits={},cf={},l={},bt={}".format(
+            self.value_size,
+            self.last_access_number,
+            self.num_hits,
+            self.cf_id,
+            self.level,
+            self.block_type,
+        )
+
+
+class HashEntry:
+    """A hash entry stored in a hash table."""
+
+    def __init__(self, key, hash, value):
+        self.key = key
+        self.hash = hash
+        self.value = value
+
+    def __repr__(self):
+        return "k={},h={},v=[{}]".format(self.key, self.hash, self.value)
+
+
+class HashTable:
+    """
+    A custom implementation of hash table to support fast random sampling.
+    It resolves hash collisions by chaining, and it grows/shrinks the table
+    upon insertion/deletion to support fast lookups and random samplings.
+    """
+
+    def __init__(self):
+        self.table = [None] * 32
+        self.elements = 0
+
+    def random_sample(self, sample_size):
+        """Randomly sample 'sample_size' hash entries from the table."""
+        samples = []
+        index = random.randint(0, len(self.table) - 1)
+        pos = (index + 1) % len(self.table)
+        searches = 0
+        # Starting from index, adding hash entries to the sample list until
+        # sample_size is met or we ran out of entries.
+        while pos != index and len(samples) < sample_size:
+            if self.table[pos] is not None:
+                for i in range(len(self.table[pos])):
+                    if self.table[pos][i] is None:
+                        continue
+                    samples.append(self.table[pos][i])
+                    if len(samples) >= sample_size:
+                        break
+            pos += 1
+            pos = pos % len(self.table)
+            searches += 1
+        return samples
+
+    def insert(self, key, hash, value):
+        """
+        Insert a hash entry in the table. Replace the old entry if it already
+        exists.
+        """
+        self.grow()
+        inserted = False
+        index = hash % len(self.table)
+        if self.table[index] is None:
+            self.table[index] = []
+        for i in range(len(self.table[index])):
+            if self.table[index][i] is not None:
+                if (
+                    self.table[index][i].hash == hash
+                    and self.table[index][i].key == key
+                ):
+                    # The entry already exists in the table.
+                    self.table[index][i] = HashEntry(key, hash, value)
+                    return
+                continue
+            self.table[index][i] = HashEntry(key, hash, value)
+            inserted = True
+            break
+        if not inserted:
+            self.table[index].append(HashEntry(key, hash, value))
+        self.elements += 1
+
+    def resize(self, new_size):
+        if new_size == len(self.table):
+            return
+        if new_size == 0:
+            return
+        if self.elements < 100:
+            return
+        new_table = [None] * new_size
+        # Copy 'self.table' to new_table.
+        for i in range(len(self.table)):
+            entries = self.table[i]
+            if entries is None:
+                continue
+            for j in range(len(entries)):
+                if entries[j] is None:
+                    continue
+                index = entries[j].hash % new_size
+                if new_table[index] is None:
+                    new_table[index] = []
+                new_table[index].append(entries[j])
+        self.table = new_table
+        del new_table
+        # Manually call python gc here to free the memory as 'self.table'
+        # might be very large.
+        gc.collect()
+
+    def grow(self):
+        if self.elements < len(self.table):
+            return
+        new_size = int(len(self.table) * 1.2)
+        self.resize(new_size)
+
+    def delete(self, key, hash):
+        index = hash % len(self.table)
+        entries = self.table[index]
+        deleted = False
+        if entries is None:
+            return
+        for i in range(len(entries)):
+            if (
+                entries[i] is not None
+                and entries[i].hash == hash
+                and entries[i].key == key
+            ):
+                entries[i] = None
+                self.elements -= 1
+                deleted = True
+                break
+        if deleted:
+            self.shrink()
+
+    def shrink(self):
+        if self.elements * 2 >= len(self.table):
+            return
+        new_size = int(len(self.table) * 0.7)
+        self.resize(new_size)
+
+    def lookup(self, key, hash):
+        index = hash % len(self.table)
+        entries = self.table[index]
+        if entries is None:
+            return None
+        for entry in entries:
+            if entry is not None and entry.hash == hash and entry.key == key:
+                return entry.value
+        return None
+
+
+class MissRatioStats:
+    def __init__(self, time_unit):
+        self.num_misses = 0
+        self.num_accesses = 0
+        self.time_unit = time_unit
+        self.time_misses = {}
+        self.time_accesses = {}
+
+    def update_metrics(self, access_time, is_hit):
+        access_time //= kMicrosInSecond * self.time_unit
+        self.num_accesses += 1
+        if access_time not in self.time_accesses:
+            self.time_accesses[access_time] = 0
+        self.time_accesses[access_time] += 1
+        if not is_hit:
+            self.num_misses += 1
+            if access_time not in self.time_misses:
+                self.time_misses[access_time] = 0
+            self.time_misses[access_time] += 1
+
+    def reset_counter(self):
+        self.num_misses = 0
+        self.num_accesses = 0
+
+    def miss_ratio(self):
+        return float(self.num_misses) * 100.0 / float(self.num_accesses)
+
+    def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-miss-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            row = "{}".format(cache_type)
+            for trace_time in range(start, end):
+                row += ",{}".format(self.time_misses.get(trace_time, 0))
+            file.write(row + "\n")
+
+    def write_miss_ratio_timeline(self, cache_type, cache_size, result_dir, start, end):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            row = "{}".format(cache_type)
+            for trace_time in range(start, end):
+                naccesses = self.time_accesses.get(trace_time, 0)
+                miss_ratio = 0
+                if naccesses > 0:
+                    miss_ratio = float(
+                        self.time_misses.get(trace_time, 0) * 100.0
+                    ) / float(naccesses)
+                row += ",{0:.2f}".format(miss_ratio)
+            file.write(row + "\n")
+
+
+class PolicyStats:
+    def __init__(self, time_unit, policies):
+        self.time_selected_polices = {}
+        self.time_accesses = {}
+        self.policy_names = {}
+        self.time_unit = time_unit
+        for i in range(len(policies)):
+            self.policy_names[i] = policies[i].policy_name()
+
+    def update_metrics(self, access_time, selected_policy):
+        access_time //= kMicrosInSecond * self.time_unit
+        if access_time not in self.time_accesses:
+            self.time_accesses[access_time] = 0
+        self.time_accesses[access_time] += 1
+        if access_time not in self.time_selected_polices:
+            self.time_selected_polices[access_time] = {}
+        policy_name = self.policy_names[selected_policy]
+        if policy_name not in self.time_selected_polices[access_time]:
+            self.time_selected_polices[access_time][policy_name] = 0
+        self.time_selected_polices[access_time][policy_name] += 1
+
+    def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-policy-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            for policy in self.policy_names:
+                policy_name = self.policy_names[policy]
+                row = "{}-{}".format(cache_type, policy_name)
+                for trace_time in range(start, end):
+                    row += ",{}".format(
+                        self.time_selected_polices.get(trace_time, {}).get(
+                            policy_name, 0
+                        )
+                    )
+                file.write(row + "\n")
+
+    def write_policy_ratio_timeline(
+        self, cache_type, cache_size, result_dir, start, end
+    ):
+        start //= kMicrosInSecond * self.time_unit
+        end //= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size
+        )
+        with open(file_path, "w+") as file:
+            for policy in self.policy_names:
+                policy_name = self.policy_names[policy]
+                row = "{}-{}".format(cache_type, policy_name)
+                for trace_time in range(start, end):
+                    naccesses = self.time_accesses.get(trace_time, 0)
+                    ratio = 0
+                    if naccesses > 0:
+                        ratio = float(
+                            self.time_selected_polices.get(trace_time, {}).get(
+                                policy_name, 0
+                            )
+                            * 100.0
+                        ) / float(naccesses)
+                    row += ",{0:.2f}".format(ratio)
+                file.write(row + "\n")
+
+
+class Policy(object):
+    """
+    A policy maintains a set of evicted keys. It returns a reward of one to
+    itself if it has not evicted a missing key. Otherwise, it gives itself 0
+    reward.
+    """
+
+    def __init__(self):
+        self.evicted_keys = {}
+
+    def evict(self, key, max_size):
+        self.evicted_keys[key] = 0
+
+    def delete(self, key):
+        self.evicted_keys.pop(key, None)
+
+    def prioritize_samples(self, samples):
+        raise NotImplementedError
+
+    def policy_name(self):
+        raise NotImplementedError
+
+    def generate_reward(self, key):
+        if key in self.evicted_keys:
+            return 0
+        return 1
+
+
+class LRUPolicy(Policy):
+    def prioritize_samples(self, samples):
+        return sorted(samples, key=lambda e: e.value.last_access_number)
+
+    def policy_name(self):
+        return "lru"
+
+
+class MRUPolicy(Policy):
+    def prioritize_samples(self, samples):
+        return sorted(
+            samples, key=lambda e: e.value.last_access_number, reverse=True
+        )
+
+    def policy_name(self):
+        return "mru"
+
+
+class LFUPolicy(Policy):
+    def prioritize_samples(self, samples):
+        return sorted(samples, key=lambda e: e.value.num_hits)
+
+    def policy_name(self):
+        return "lfu"
+
+
+class MLCache(object):
+    def __init__(self, cache_size, enable_cache_row_key, policies):
+        self.cache_size = cache_size
+        self.used_size = 0
+        self.miss_ratio_stats = MissRatioStats(kSecondsInMinute)
+        self.policy_stats = PolicyStats(kSecondsInMinute, policies)
+        self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour)
+        self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies)
+        self.table = HashTable()
+        self.enable_cache_row_key = enable_cache_row_key
+        self.get_id_row_key_map = {}
+        self.policies = policies
+
+    def _lookup(self, key, hash):
+        value = self.table.lookup(key, hash)
+        if value is not None:
+            value.last_access_number = self.miss_ratio_stats.num_accesses
+            value.num_hits += 1
+            return True
+        return False
+
+    def _select_policy(self, trace_record, key):
+        raise NotImplementedError
+
+    def cache_name(self):
+        raise NotImplementedError
+
+    def _evict(self, policy_index, value_size):
+        # Randomly sample n entries.
+ samples = self.table.random_sample(kSampleSize) + samples = self.policies[policy_index].prioritize_samples(samples) + for hash_entry in samples: + self.used_size -= hash_entry.value.value_size + self.table.delete(hash_entry.key, hash_entry.hash) + self.policies[policy_index].evict( + key=hash_entry.key, max_size=self.table.elements + ) + if self.used_size + value_size <= self.cache_size: + break + + def _insert(self, trace_record, key, hash, value_size): + if value_size > self.cache_size: + return + policy_index = self._select_policy(trace_record, key) + self.policies[policy_index].delete(key) + self.policy_stats.update_metrics(trace_record.access_time, policy_index) + self.per_hour_policy_stats.update_metrics( + trace_record.access_time, policy_index + ) + while self.used_size + value_size > self.cache_size: + self._evict(policy_index, value_size) + self.table.insert( + key, + hash, + CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + self.miss_ratio_stats.num_accesses, + ), + ) + self.used_size += value_size + + def _access_kv(self, trace_record, key, hash, value_size, no_insert): + if self._lookup(key, hash): + return True + if not no_insert and value_size > 0: + self._insert(trace_record, key, hash, value_size) + return False + + def _update_stats(self, access_time, is_hit): + self.miss_ratio_stats.update_metrics(access_time, is_hit) + self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit) + + def access(self, trace_record): + assert self.used_size <= self.cache_size + if ( + self.enable_cache_row_key + and trace_record.caller == 1 + and trace_record.key_id != 0 + and trace_record.get_id != 0 + ): + # This is a get request. + if trace_record.get_id not in self.get_id_row_key_map: + self.get_id_row_key_map[trace_record.get_id] = {} + self.get_id_row_key_map[trace_record.get_id]["h"] = False + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + self._update_stats(trace_record.access_time, is_hit=True) + return + if trace_record.key_id not in self.get_id_row_key_map[trace_record.get_id]: + # First time seen this key. + is_hit = self._access_kv( + trace_record, + key="g{}".format(trace_record.key_id), + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + inserted = False + if trace_record.kv_size > 0: + inserted = True + self.get_id_row_key_map[trace_record.get_id][ + trace_record.key_id + ] = inserted + self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + self._update_stats(trace_record.access_time, is_hit=True) + return + # Access its blocks. + is_hit = self._access_kv( + trace_record, + key="b{}".format(trace_record.block_id), + hash=trace_record.block_id, + value_size=trace_record.block_size, + no_insert=trace_record.no_insert, + ) + self._update_stats(trace_record.access_time, is_hit) + if ( + trace_record.kv_size > 0 + and not self.get_id_row_key_map[trace_record.get_id][ + trace_record.key_id + ] + ): + # Insert the row key-value pair. + self._access_kv( + trace_record, + key="g{}".format(trace_record.key_id), + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + # Mark as inserted. + self.get_id_row_key_map[trace_record.get_id][trace_record.key_id] = True + return + # Access the block. 
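+        # Non-Get accesses (and all accesses when row-key caching is
+        # disabled) fall through to a plain block lookup keyed by block_id.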
+ is_hit = self._access_kv( + trace_record, + key="b{}".format(trace_record.block_id), + hash=trace_record.block_id, + value_size=trace_record.block_size, + no_insert=trace_record.no_insert, + ) + self._update_stats(trace_record.access_time, is_hit) + + +class ThompsonSamplingCache(MLCache): + """ + An implementation of Thompson Sampling for the Bernoulli Bandit [1]. + [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, + and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. + Trends Mach. Learn. 11, 1 (July 2018), 1-96. + DOI: https://doi.org/10.1561/2200000070 + """ + + def __init__(self, cache_size, enable_cache_row_key, policies, init_a=1, init_b=1): + super(ThompsonSamplingCache, self).__init__( + cache_size, enable_cache_row_key, policies + ) + self._as = {} + self._bs = {} + for _i in range(len(policies)): + self._as = [init_a] * len(self.policies) + self._bs = [init_b] * len(self.policies) + + def _select_policy(self, trace_record, key): + samples = [ + np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies)) + ] + selected_policy = max(range(len(self.policies)), key=lambda x: samples[x]) + reward = self.policies[selected_policy].generate_reward(key) + assert reward <= 1 and reward >= 0 + self._as[selected_policy] += reward + self._bs[selected_policy] += 1 - reward + return selected_policy + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid ThompsonSampling (ts_hybrid)" + return "ThompsonSampling (ts)" + + +class LinUCBCache(MLCache): + """ + An implementation of LinUCB with disjoint linear models [2]. + [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. + A contextual-bandit approach to personalized news article recommendation. + In Proceedings of the 19th international conference on World wide web + (WWW '10). ACM, New York, NY, USA, 661-670. + DOI=http://dx.doi.org/10.1145/1772690.1772758 + """ + + def __init__(self, cache_size, enable_cache_row_key, policies): + super(LinUCBCache, self).__init__(cache_size, enable_cache_row_key, policies) + self.nfeatures = 4 # Block type, caller, level, cf. 
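+ # Per-policy LinUCB state: th/th_hat hold the coefficient estimates, A is
+ # the design matrix (A_inv caches its inverse), b is the response vector,
+ # and alph scales the upper confidence bound used for exploration.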
+ self.th = np.zeros((len(self.policies), self.nfeatures)) + self.eps = 0.2 + self.b = np.zeros_like(self.th) + self.A = np.zeros((len(self.policies), self.nfeatures, self.nfeatures)) + self.A_inv = np.zeros((len(self.policies), self.nfeatures, self.nfeatures)) + for i in range(len(self.policies)): + self.A[i] = np.identity(self.nfeatures) + self.th_hat = np.zeros_like(self.th) + self.p = np.zeros(len(self.policies)) + self.alph = 0.2 + + def _select_policy(self, trace_record, key): + x_i = np.zeros(self.nfeatures) # The current context vector + x_i[0] = trace_record.block_type + x_i[1] = trace_record.caller + x_i[2] = trace_record.level + x_i[3] = trace_record.cf_id + p = np.zeros(len(self.policies)) + for a in range(len(self.policies)): + self.th_hat[a] = self.A_inv[a].dot(self.b[a]) + ta = x_i.dot(self.A_inv[a]).dot(x_i) + a_upper_ci = self.alph * np.sqrt(ta) + a_mean = self.th_hat[a].dot(x_i) + p[a] = a_mean + a_upper_ci + p = p + (np.random.random(len(p)) * 0.000001) + selected_policy = p.argmax() + reward = self.policies[selected_policy].generate_reward(key) + assert reward <= 1 and reward >= 0 + self.A[selected_policy] += np.outer(x_i, x_i) + self.b[selected_policy] += reward * x_i + self.A_inv[selected_policy] = np.linalg.inv(self.A[selected_policy]) + del x_i + return selected_policy + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid LinUCB (linucb_hybrid)" + return "LinUCB (linucb)" + + +def parse_cache_size(cs): + cs = cs.replace("\n", "") + if cs[-1] == "M": + return int(cs[: len(cs) - 1]) * 1024 * 1024 + if cs[-1] == "G": + return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 + if cs[-1] == "T": + return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 * 1024 + return int(cs) + + +def create_cache(cache_type, cache_size, downsample_size): + policies = [] + policies.append(LRUPolicy()) + policies.append(MRUPolicy()) + policies.append(LFUPolicy()) + cache_size = cache_size / downsample_size + enable_cache_row_key = False + if "hybrid" in cache_type: + enable_cache_row_key = True + cache_type = cache_type[:-7] + if cache_type == "ts": + return ThompsonSamplingCache(cache_size, enable_cache_row_key, policies) + elif cache_type == "linucb": + return LinUCBCache(cache_size, enable_cache_row_key, policies) + else: + print("Unknown cache type {}".format(cache_type)) + assert False + return None + + +def run(trace_file_path, cache_type, cache, warmup_seconds): + warmup_complete = False + num = 0 + trace_start_time = 0 + trace_duration = 0 + start_time = time.time() + time_interval = 1 + trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + with open(trace_file_path, "r") as trace_file: + for line in trace_file: + num += 1 + if num % 1000000 == 0: + # Force a python gc periodically to reduce memory usage. 
+ gc.collect() + ts = line.split(",") + timestamp = int(ts[0]) + if trace_start_time == 0: + trace_start_time = timestamp + trace_duration = timestamp - trace_start_time + if not warmup_complete and trace_duration > warmup_seconds * 1000000: + cache.miss_ratio_stats.reset_counter() + warmup_complete = True + record = TraceRecord( + access_time=int(ts[0]), + block_id=int(ts[1]), + block_type=int(ts[2]), + block_size=int(ts[3]), + cf_id=int(ts[4]), + cf_name=ts[5], + level=int(ts[6]), + fd=int(ts[7]), + caller=int(ts[8]), + no_insert=int(ts[9]), + get_id=int(ts[10]), + key_id=int(ts[11]), + kv_size=int(ts[12]), + is_hit=int(ts[13]), + ) + trace_miss_ratio_stats.update_metrics( + record.access_time, is_hit=record.is_hit + ) + cache.access(record) + del record + if num % 100 != 0: + continue + # Report progress every 10 seconds. + now = time.time() + if now - start_time > time_interval * 10: + print( + "Take {} seconds to process {} trace records with trace " + "duration of {} seconds. Throughput: {} records/second. " + "Trace miss ratio {}".format( + now - start_time, + num, + trace_duration / 1000000, + num / (now - start_time), + trace_miss_ratio_stats.miss_ratio(), + ) + ) + time_interval += 1 + print( + "{},0,0,{},{},{}".format( + cache_type, + cache.cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + now = time.time() + print( + "Take {} seconds to process {} trace records with trace duration of {} " + "seconds. Throughput: {} records/second. Trace miss ratio {}".format( + now - start_time, + num, + trace_duration / 1000000, + num / (now - start_time), + trace_miss_ratio_stats.miss_ratio(), + ) + ) + return trace_start_time, trace_duration + + +def report_stats( + cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time +): + cache_label = "{}-{}".format(cache_type, cache_size) + with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file: + mrc_file.write( + "{},0,0,{},{},{}\n".format( + cache_type, + cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) + cache.policy_stats.write_policy_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.policy_stats.write_policy_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.miss_ratio_stats.write_miss_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.miss_ratio_stats.write_miss_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_policy_stats.write_policy_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_policy_stats.write_policy_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_miss_ratio_stats.write_miss_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + cache.per_hour_miss_ratio_stats.write_miss_ratio_timeline( + cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) + + +if __name__ == "__main__": + if len(sys.argv) <= 6: + print( + "Must provide 6 arguments. " + "1) cache_type (ts, ts_hybrid, linucb, linucb_hybrid). " + "2) cache size (xM, xG, xT). " + "3) The sampling frequency used to collect the trace. (The " + "simulation scales down the cache size by the sampling frequency). " + "4) Warmup seconds (The number of seconds used for warmup). " + "5) Trace file path. 
" + "6) Result directory (A directory that saves generated results)" + ) + exit(1) + cache_type = sys.argv[1] + cache_size = parse_cache_size(sys.argv[2]) + downsample_size = int(sys.argv[3]) + warmup_seconds = int(sys.argv[4]) + trace_file_path = sys.argv[5] + result_dir = sys.argv[6] + cache = create_cache(cache_type, cache_size, downsample_size) + trace_start_time, trace_duration = run( + trace_file_path, cache_type, cache, warmup_seconds + ) + trace_end_time = trace_start_time + trace_duration + report_stats( + cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time + ) diff --git a/tools/block_cache_analyzer/block_cache_pysim.sh b/tools/block_cache_analyzer/block_cache_pysim.sh new file mode 100644 index 00000000000..58193a0635a --- /dev/null +++ b/tools/block_cache_analyzer/block_cache_pysim.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# A shell script to run a batch of pysims and combine individual pysim output files. +# +# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs +# trace_file_path: The file path that stores the traces. +# result_dir: The directory to store pysim results. The output files from a pysim is stores in result_dir/ml +# downsample_size: The downsample size used to collect the trace. +# warmup_seconds: The number of seconds used for warmup. +# max_jobs: The max number of concurrent pysims to run. + +if [ $# -ne 5 ]; then + echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs" + exit 0 +fi + +trace_file="$1" +result_dir="$2" +downsample_size="$3" +warmup_seconds="$4" +max_jobs="$5" +current_jobs=0 + +ml_tmp_result_dir="$result_dir/ml" +rm -rf "$ml_tmp_result_dir" +mkdir -p "$result_dir" +mkdir -p "$ml_tmp_result_dir" + +for cache_type in "ts" "linucb" "ts_hybrid" "linucb_hybrid" +do +for cache_size in "16M" "256M" "1G" "2G" "4G" "8G" "12G" "16G" +do + while [ "$current_jobs" -ge "$max_jobs" ] + do + sleep 10 + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + done + output="log-ml-$cache_type-$cache_size" + echo "Running simulation for $cache_type and cache size $cache_size. Number of running jobs: $current_jobs. " + nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" >& $ml_tmp_result_dir/$output & + current_jobs=$((current_jobs+1)) +done +done + +# Wait for all jobs to complete. +while [ $current_jobs -gt 0 ] +do + sleep 10 + echo "Waiting jobs to complete. Number of running jobs: $current_jobs" + current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) + echo "Waiting jobs to complete. 
Number of running jobs: $current_jobs" +done + +echo "Combine individual pysim output files" + +rm -rf "$result_dir/ml_*" +mrc_file="$result_dir/ml_mrc" +for header in "header-" "data-" +do +for fn in $ml_tmp_result_dir/* +do + sum_file="" + time_unit="" + capacity="" + if [[ $fn == *"timeline"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit_index=0 + capacity_index=0 + for i in "${elements[@]}" + do + if [[ $i == "timeline" ]]; then + break + fi + time_unit_index=$((time_unit_index+1)) + done + time_unit_index=$((time_unit_index+1)) + capacity_index=$((time_unit_index+2)) + time_unit="${elements[$time_unit_index]}_" + capacity="${elements[$capacity_index]}_" + fi + + if [[ $fn == "${header}ml-policy-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}policy_timeline" + fi + if [[ $fn == "${header}ml-policy-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}policy_ratio_timeline" + fi + if [[ $fn == "${header}ml-miss-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}miss_timeline" + fi + if [[ $fn == "${header}ml-miss-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${capacity}${time_unit}miss_ratio_timeline" + fi + if [[ $fn == "${header}ml-mrc"* ]]; then + sum_file="$mrc_file" + fi + if [[ $sum_file == "" ]]; then + continue + fi + if [[ $header == "header-" ]]; then + if [ -e "$sum_file" ]; then + continue + fi + fi + cat "$ml_tmp_result_dir/$fn" >> "$sum_file" +done +done + +echo "Done" +# Sort MRC file by cache_type and cache_size. +tmp_file="$result_dir/tmp_mrc" +cat "$mrc_file" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" +cat "$tmp_file" > "$mrc_file" +rm -rf "$tmp_file" diff --git a/tools/block_cache_analyzer/block_cache_pysim_test.py b/tools/block_cache_analyzer/block_cache_pysim_test.py new file mode 100644 index 00000000000..e298d7bbd6f --- /dev/null +++ b/tools/block_cache_analyzer/block_cache_pysim_test.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
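+# Tests for block_cache_pysim: the hash table, the LRU/MRU/LFU eviction
+# policies, and the Thompson Sampling and LinUCB caches (including the
+# hybrid row-block variants).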
+ +import random + +from block_cache_pysim import ( + HashTable, + LFUPolicy, + LinUCBCache, + LRUPolicy, + MRUPolicy, + ThompsonSamplingCache, + TraceRecord, + kSampleSize, +) + + +def test_hash_table(): + print("Test hash table") + table = HashTable() + data_size = 10000 + for i in range(data_size): + table.insert("k{}".format(i), i, "v{}".format(i)) + for i in range(data_size): + assert table.lookup("k{}".format(i), i) is not None + for i in range(data_size): + table.delete("k{}".format(i), i) + for i in range(data_size): + assert table.lookup("k{}".format(i), i) is None + + truth_map = {} + n = 1000000 + records = 100 + for i in range(n): + key_id = random.randint(0, records) + key = "k{}".format(key_id) + value = "v{}".format(key_id) + action = random.randint(0, 2) + # print "{}:{}:{}".format(action, key, value) + assert len(truth_map) == table.elements, "{} {} {}".format( + len(truth_map), table.elements, i + ) + if action == 0: + table.insert(key, key_id, value) + truth_map[key] = value + elif action == 1: + if key in truth_map: + assert table.lookup(key, key_id) is not None + assert truth_map[key] == table.lookup(key, key_id) + else: + assert table.lookup(key, key_id) is None + else: + table.delete(key, key_id) + if key in truth_map: + del truth_map[key] + print("Test hash table: Success") + + +def assert_metrics(cache, expected_value): + assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format( + expected_value[0], cache.used_size + ) + assert ( + cache.miss_ratio_stats.num_accesses == expected_value[1] + ), "Expected {}, Actual {}".format( + expected_value[1], cache.miss_ratio_stats.num_accesses + ) + assert ( + cache.miss_ratio_stats.num_misses == expected_value[2] + ), "Expected {}, Actual {}".format( + expected_value[2], cache.miss_ratio_stats.num_misses + ) + assert cache.table.elements == len(expected_value[3]) + len( + expected_value[4] + ), "Expected {}, Actual {}".format( + len(expected_value[3]) + len(expected_value[4]), cache.table.elements + ) + for expeceted_k in expected_value[3]: + val = cache.table.lookup("b{}".format(expeceted_k), expeceted_k) + assert val is not None + assert val.value_size == 1 + for expeceted_k in expected_value[4]: + val = cache.table.lookup("g{}".format(expeceted_k), expeceted_k) + assert val is not None + assert val.value_size == 1 + + +# Access k1, k1, k2, k3, k3, k3, k4 +def test_cache(policies, expected_value): + cache = ThompsonSamplingCache(3, False, policies) + k1 = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + k2 = TraceRecord( + access_time=1, + block_id=2, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + k3 = TraceRecord( + access_time=2, + block_id=3, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + k4 = TraceRecord( + access_time=3, + block_id=4, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=5, + is_hit=1, + ) + sequence = [k1, k1, k2, k3, k3, k3] + index = 0 + expected_values = [] + # Access k1, miss. + expected_values.append([1, 1, 1, [1], []]) + # Access k1, hit. + expected_values.append([1, 2, 1, [1], []]) + # Access k2, miss. 
+ expected_values.append([2, 3, 2, [1, 2], []]) + # Access k3, miss. + expected_values.append([3, 4, 3, [1, 2, 3], []]) + # Access k3, hit. + expected_values.append([3, 5, 3, [1, 2, 3], []]) + # Access k3, hit. + expected_values.append([3, 6, 3, [1, 2, 3], []]) + for access in sequence: + cache.access(access) + assert_metrics(cache, expected_values[index]) + index += 1 + cache.access(k4) + assert_metrics(cache, expected_value) + + +def test_lru_cache(): + print("Test LRU cache") + policies = [] + policies.append(LRUPolicy()) + # Access k4, miss. evict k1 + test_cache(policies, [3, 7, 4, [2, 3, 4], []]) + print("Test LRU cache: Success") + + +def test_mru_cache(): + print("Test MRU cache") + policies = [] + policies.append(MRUPolicy()) + # Access k4, miss. evict k3 + test_cache(policies, [3, 7, 4, [1, 2, 4], []]) + print("Test MRU cache: Success") + + +def test_lfu_cache(): + print("Test LFU cache") + policies = [] + policies.append(LFUPolicy()) + # Access k4, miss. evict k2 + test_cache(policies, [3, 7, 4, [1, 3, 4], []]) + print("Test LFU cache: Success") + + +def test_mix(cache): + print("Test Mix {} cache".format(cache.cache_name())) + n = 100000 + records = 199 + for i in range(n): + key_id = random.randint(0, records) + vs = random.randint(0, 10) + k = TraceRecord( + access_time=i, + block_id=key_id, + block_type=1, + block_size=vs, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=key_id, + key_id=key_id, + kv_size=5, + is_hit=1, + ) + cache.access(k) + assert cache.miss_ratio_stats.miss_ratio() > 0 + print("Test Mix {} cache: Success".format(cache.cache_name())) + + +def test_hybrid(cache): + print("Test {} cache".format(cache.cache_name())) + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, # the first get request. + key_id=1, + kv_size=0, # no size. + is_hit=1, + ) + cache.access(k) # Expect a miss. + # used size, num accesses, num misses, hash table size, blocks, get keys. + assert_metrics(cache, [1, 1, 1, [1], []]) + k.access_time += 1 + k.kv_size = 1 + k.block_id = 2 + cache.access(k) # k should be inserted. + assert_metrics(cache, [3, 2, 2, [1, 2], [1]]) + k.access_time += 1 + k.block_id = 3 + cache.access(k) # k should not be inserted again. + assert_metrics(cache, [4, 3, 3, [1, 2, 3], [1]]) + # A second get request referencing the same key. + k.access_time += 1 + k.get_id = 2 + k.block_id = 4 + k.kv_size = 0 + cache.access(k) # k should observe a hit. No block access. + assert_metrics(cache, [4, 4, 3, [1, 2, 3], [1]]) + + # A third get request searches three files, three different keys. + # And the second key observes a hit. + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 3 + k.key_id = 2 + cache.access(k) # k should observe a miss. block 3 observes a hit. + assert_metrics(cache, [5, 5, 3, [1, 2, 3], [1, 2]]) + + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 4 + k.kv_size = 1 + k.key_id = 1 + cache.access(k) # k1 should observe a hit. + assert_metrics(cache, [5, 6, 3, [1, 2, 3], [1, 2]]) + + k.access_time += 1 + k.kv_size = 1 + k.get_id = 3 + k.block_id = 4 + k.kv_size = 1 + k.key_id = 3 + # k3 should observe a miss. + # However, as the get already complete, we should not access k3 any more. + cache.access(k) + assert_metrics(cache, [5, 7, 3, [1, 2, 3], [1, 2]]) + + # A fourth get request searches one file and two blocks. One row key. 
+ k.access_time += 1 + k.get_id = 4 + k.block_id = 5 + k.key_id = 4 + k.kv_size = 1 + cache.access(k) + assert_metrics(cache, [7, 8, 4, [1, 2, 3, 5], [1, 2, 4]]) + + # A bunch of insertions which evict cached row keys. + for i in range(6, 100): + k.access_time += 1 + k.get_id = 0 + k.block_id = i + cache.access(k) + + k.get_id = 4 + k.block_id = 100 # A different block. + k.key_id = 4 # Same row key and should not be inserted again. + k.kv_size = 1 + cache.access(k) + assert_metrics(cache, [16, 103, 99, [i for i in range(101 - kSampleSize, 101)], []]) + print("Test {} cache: Success".format(cache.cache_name())) + + +if __name__ == "__main__": + policies = [] + policies.append(MRUPolicy()) + policies.append(LRUPolicy()) + policies.append(LFUPolicy()) + test_hash_table() + test_lru_cache() + test_mru_cache() + test_lfu_cache() + test_mix(ThompsonSamplingCache(100, False, policies)) + test_mix(ThompsonSamplingCache(100, True, policies)) + test_mix(LinUCBCache(100, False, policies)) + test_mix(LinUCBCache(100, True, policies)) + test_hybrid(ThompsonSamplingCache(kSampleSize, True, [LRUPolicy()])) + test_hybrid(LinUCBCache(kSampleSize, True, [LRUPolicy()])) diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc similarity index 99% rename from tools/block_cache_trace_analyzer.cc rename to tools/block_cache_analyzer/block_cache_trace_analyzer.cc index 761395a6654..032ed2be24f 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE #ifdef GFLAGS -#include "tools/block_cache_trace_analyzer.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" #include #include @@ -1395,13 +1395,12 @@ Status BlockCacheTraceAnalyzer::WriteHumanReadableTraceRecord( } int ret = snprintf( trace_record_buffer_, sizeof(trace_record_buffer_), - "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%" PRIu32 ",%" PRIu64 - "" - ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n", + "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%s,%" PRIu32 + ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n", access.access_timestamp, block_id, access.block_type, access.block_size, - access.cf_id, access.level, access.sst_fd_number, access.caller, - access.no_insert, access.get_id, get_key_id, access.referenced_data_size, - access.is_cache_hit); + access.cf_id, access.cf_name.c_str(), access.level, access.sst_fd_number, + access.caller, access.no_insert, access.get_id, get_key_id, + access.referenced_data_size, access.is_cache_hit); if (ret < 0) { return Status::IOError("failed to format the output"); } @@ -2134,6 +2133,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.WriteAccessTimeline(label, kSecondInHour, false); } else { analyzer.WriteAccessTimeline(label, kSecondInMinute, false); + analyzer.WriteAccessTimeline(label, kSecondInHour, false); } } } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h similarity index 100% rename from tools/block_cache_trace_analyzer.h rename to tools/block_cache_analyzer/block_cache_trace_analyzer.h diff --git a/tools/block_cache_trace_analyzer_plot.py b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py similarity index 100% rename from tools/block_cache_trace_analyzer_plot.py rename to tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py diff --git a/tools/block_cache_trace_analyzer_test.cc 
b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc similarity index 99% rename from tools/block_cache_trace_analyzer_test.cc rename to tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index a028bf197c9..9917d5b9e78 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -23,7 +23,7 @@ int main() { #include "rocksdb/trace_reader_writer.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "tools/block_cache_trace_analyzer.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" #include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -343,7 +343,7 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::string l; ASSERT_TRUE(getline(ss, l, ',')); if (l.find("block") == std::string::npos) { - if (unit != "_60" || user_access_only != "all_access_") { + if (user_access_only != "all_access_") { continue; } } diff --git a/tools/block_cache_trace_analyzer_tool.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc similarity index 91% rename from tools/block_cache_trace_analyzer_tool.cc rename to tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc index b7b36c5d241..63382cf8c22 100644 --- a/tools/block_cache_trace_analyzer_tool.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc @@ -11,7 +11,7 @@ int main() { return 1; } #else // GFLAGS -#include "tools/block_cache_trace_analyzer.h" +#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" int main(int argc, char** argv) { return rocksdb::block_cache_trace_analyzer_tool(argc, argv); } From 3f89af1c39da4991ef6c544fc5e3f164a688b375 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 26 Jul 2019 15:48:35 -0700 Subject: [PATCH 263/572] Reduce the number of random iterations in compact_on_deletion_collector_test (#5635) Summary: This test frequently times out under TSAN; reducing the number of random iterations to make it complete faster. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5635 Test Plan: buck test mode/dev-tsan internal_repo_rocksdb/repo:compact_on_deletion_collector_test Differential Revision: D16523505 Pulled By: ltamasi fbshipit-source-id: 6a69909bce9d204c891150fcb3d536547b3253d0 --- .../compact_on_deletion_collector_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index 101aa988b66..57eed107011 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -40,7 +40,7 @@ int main(int /*argc*/, char** /*argv*/) { // randomize tests rocksdb::Random rnd(301); const int kMaxTestSize = 100000l; - for (int random_test = 0; random_test < 30; random_test++) { + for (int random_test = 0; random_test < 10; random_test++) { int window_size = rnd.Uniform(kMaxTestSize) + 1; int deletion_trigger = rnd.Uniform(window_size); window_sizes.emplace_back(window_size); From 80d7067cb2e1d675104fac9e7d5e52b3aa56aa3b Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Fri, 26 Jul 2019 16:28:38 -0700 Subject: [PATCH 264/572] Use int64_t instead of ssize_t (#5638) Summary: The ssize_t type was introduced in https://github.com/facebook/rocksdb/pull/5633, but it seems like it's a POSIX specific type. 
I just need a signed type to represent a number of bytes, so use int64_t instead. It seems like we have a typedef from SSIZE_T for Windows, but it doesn't seem like we ever include "port/port.h" in our public header files.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5638

Differential Revision: D16526269

Pulled By: lth

fbshipit-source-id: 8d3a5c41003951b74b29bc5f1d949b2b22da0cee
---
 include/rocksdb/utilities/transaction_db.h    | 4 ++--
 utilities/transactions/write_unprepared_txn.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 33826bab861..7798e63da7b 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -104,7 +104,7 @@ struct TransactionDBOptions {
 // This option is only valid for write unprepared. If a write batch exceeds
 // this threshold, then the transaction will implicitly flush the currently
 // pending writes into the database. A value of 0 or less means no limit.
- ssize_t default_write_batch_flush_threshold = 0;
+ int64_t default_write_batch_flush_threshold = 0;

 private:
 // 128 entries
@@ -171,7 +171,7 @@ struct TransactionOptions {
 // See TransactionDBOptions::default_write_batch_flush_threshold for
 // description. If a negative value is specified, then the default value from
 // TransactionDBOptions is used.
- ssize_t write_batch_flush_threshold = -1;
+ int64_t write_batch_flush_threshold = -1;
 };

 // The per-write optimizations that do not involve transactions. TransactionDB
diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h
index feac749ee82..bc952544ab0 100644
--- a/utilities/transactions/write_unprepared_txn.h
+++ b/utilities/transactions/write_unprepared_txn.h
@@ -167,7 +167,7 @@ class WriteUnpreparedTxn : public WritePreparedTxn {
 // write_batch_flush_threshold_ has been exceeded, and then call
 // FlushWriteBatchToDB if so. This logic is encapsulated in
 // MaybeFlushWriteBatchToDB.
- ssize_t write_batch_flush_threshold_;
+ int64_t write_batch_flush_threshold_;

 WriteUnpreparedTxnDB* wupt_db_;

 // Ordered list of unprep_seq sequence numbers that we have already written

From e648c1d9eb093e6cbb2ed500b17915a43c5aa172 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Mon, 29 Jul 2019 10:52:32 -0700
Subject: [PATCH 265/572] Cache simulator: Optimize hybrid row-block cache. (#5616)

Summary:
This PR optimizes the hybrid row-block cache simulator. If a Get request hits the cache, we treat all its future accesses as hits.

Consider a Get request (no snapshot) that accesses multiple files, e.g., file1, file2, file3. We construct the row key as "fdnumber_key_0". Before this PR, if it hits the cache when searching the key in file1, we continue to process its accesses in file2 and file3, which is unnecessary.

With this PR, if "file1_key_0" is in the cache, we treat all future accesses of this Get request as hits.
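To make the new bookkeeping concrete, here is a minimal, self-contained C++ sketch of the fast path this change adds. `GetRequestStatus` mirrors the struct introduced in the diff below; `AccessForGet` and its parameters are illustrative stand-ins for the simulator's real plumbing, not actual RocksDB API:

```cpp
#include <cstdint>
#include <map>
#include <string>

enum class InsertResult { INSERTED, ADMITTED, NO_INSERT };

// Mirrors GetRequestStatus from the diff: one record per get id.
struct GetRequestStatus {
  // Set once the referenced row key has been served from the cache.
  bool is_complete = false;
  // Insertion state of each row key referenced by this get request.
  std::map<std::string, InsertResult> row_key_status;
};

// Illustrative fast path: once a get completes, later accesses issued by the
// same get are counted as hits without simulating any block lookups.
bool AccessForGet(std::map<uint64_t, GetRequestStatus>& getid_status_map,
                  uint64_t get_id, const std::string& row_key,
                  bool row_key_in_cache) {
  GetRequestStatus& status = getid_status_map[get_id];
  if (status.is_complete) {
    return true;  // Hit: skip the index/filter/data block accesses entirely.
  }
  if (row_key_in_cache) {
    status.is_complete = true;
    return true;
  }
  // Miss: remember the row key; the caller goes on to access the get's
  // index/filter/data blocks and may later insert the row key-value pair.
  status.row_key_status.emplace(row_key, InsertResult::ADMITTED);
  return false;
}
```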
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5616 Differential Revision: D16453187 Pulled By: HaoyuHuang fbshipit-source-id: 56f3169cc322322305baaf5543226a0824fae19f --- utilities/simulator_cache/cache_simulator.cc | 45 +++--- utilities/simulator_cache/cache_simulator.h | 19 ++- .../simulator_cache/cache_simulator_test.cc | 149 +++++++++++++++++- 3 files changed, 186 insertions(+), 27 deletions(-) diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index 06de4c11996..98a5c8a695f 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -122,14 +122,26 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { // TODO (haoyu): We only support Get for now. We need to extend the tracing // for MultiGet, i.e., non-data block accesses must log all keys in a // MultiGet. - bool is_cache_miss = false; + bool is_cache_miss = true; bool admitted = false; if (access.caller == TableReaderCaller::kUserGet && access.get_id != BlockCacheTraceHelper::kReservedGetId) { - // This is a Get/MultiGet request. + // This is a Get request. const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access); - if (getid_getkeys_map_[access.get_id].find(row_key) == - getid_getkeys_map_[access.get_id].end()) { + GetRequestStatus& status = getid_status_map_[access.get_id]; + if (status.is_complete) { + // This Get request completes. + // Skip future accesses to its index/filter/data + // blocks. These block lookups are unnecessary if we observe a hit for the + // referenced key-value pair already. Thus, we treat these lookups as + // hits. This is also to ensure the total number of accesses are the same + // when comparing to other policies. + miss_ratio_stats_.UpdateMetrics(access.access_timestamp, + /*is_user_access=*/true, + /*is_cache_miss=*/false); + return; + } + if (status.row_key_status.find(row_key) == status.row_key_status.end()) { // This is the first time that this key is accessed. Look up the key-value // pair first. Do not update the miss/accesses metrics here since it will // be updated later. @@ -144,37 +156,30 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { } else if (admitted) { result = InsertResult::ADMITTED; } - getid_getkeys_map_[access.get_id][row_key] = - std::make_pair(is_cache_miss, result); + status.row_key_status[row_key] = result; } - std::pair miss_inserted = - getid_getkeys_map_[access.get_id][row_key]; - if (!miss_inserted.first) { - // This is a cache hit. Skip future accesses to its index/filter/data - // blocks. These block lookups are unnecessary if we observe a hit for the - // referenced key-value pair already. Thus, we treat these lookups as - // hits. This is also to ensure the total number of accesses are the same - // when comparing to other policies. + if (!is_cache_miss) { + // A cache hit. + status.is_complete = true; miss_ratio_stats_.UpdateMetrics(access.access_timestamp, /*is_user_access=*/true, /*is_cache_miss=*/false); return; } - // The key-value pair observes a cache miss. We need to access its + // The row key-value pair observes a cache miss. We need to access its // index/filter/data blocks. 
+ InsertResult inserted = status.row_key_status[row_key]; AccessKVPair( - access.block_key, access.block_type, ComputeBlockPriority(access), + access.block_key, access.block_size, ComputeBlockPriority(access), access, /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert, /*is_user_access=*/true, &is_cache_miss, &admitted, /*update_metrics=*/true); - if (access.referenced_data_size > 0 && - miss_inserted.second == InsertResult::ADMITTED) { + if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) { sim_cache_->Insert(row_key, /*value=*/nullptr, access.referenced_data_size, /*deleter=*/nullptr, /*handle=*/nullptr, Cache::Priority::HIGH); - getid_getkeys_map_[access.get_id][row_key] = - std::make_pair(true, InsertResult::INSERTED); + status.row_key_status[row_key] = InsertResult::INSERTED; } return; } diff --git a/utilities/simulator_cache/cache_simulator.h b/utilities/simulator_cache/cache_simulator.h index 3863fcf88dd..6f2a7e84d2b 100644 --- a/utilities/simulator_cache/cache_simulator.h +++ b/utilities/simulator_cache/cache_simulator.h @@ -47,6 +47,7 @@ class MissRatioStats { return static_cast(num_misses_ * 100.0 / num_accesses_); } uint64_t total_accesses() const { return num_accesses_; } + uint64_t total_misses() const { return num_misses_; } const std::map& num_accesses_timeline() const { return num_accesses_timeline_; @@ -63,6 +64,7 @@ class MissRatioStats { return static_cast(user_misses_ * 100.0 / user_accesses_); } uint64_t user_accesses() const { return user_accesses_; } + uint64_t user_misses() const { return user_misses_; } void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access, bool is_cache_miss); @@ -168,17 +170,24 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator { NO_INSERT, }; - // A map stores get_id to a map of row keys. For each row key, it stores a - // boolean and an enum. The first bool is true when we observe a miss upon the - // first time we encounter the row key. The second arg is INSERTED when the + // We set is_complete to true when the referenced row-key of a get request + // hits the cache. If is_complete is true, we treat future accesses of this + // get request as hits. + // + // For each row key, it stores an enum. It is INSERTED when the // kv-pair has been inserted into the cache, ADMITTED if it should be inserted // but haven't been, NO_INSERT if it should not be inserted. // // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not // know its size. This may happen if the first access on the referenced key is // an index/filter block. - std::map>> - getid_getkeys_map_; + struct GetRequestStatus { + bool is_complete = false; + std::map row_key_status; + }; + + // A map stores get_id to a map of row keys. 
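+ // Keyed by get_id; each entry records whether the get request has
+ // completed and the insertion state of every row key it referenced.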
+ std::map getid_status_map_; bool insert_blocks_upon_row_kvpair_miss_; }; diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index dc3b8327e01..babdd431f5a 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -14,6 +14,7 @@ namespace rocksdb { namespace { const std::string kBlockKeyPrefix = "test-block-"; const std::string kRefKeyPrefix = "test-get-"; +const std::string kRefKeySequenceNumber = std::string(8, 'c'); const uint64_t kGetId = 1; const uint64_t kGetBlockId = 100; const uint64_t kCompactionBlockId = 1000; @@ -38,12 +39,12 @@ class CacheSimulatorTest : public testing::Test { record.cf_name = "test"; record.caller = TableReaderCaller::kUserGet; record.level = 6; - record.sst_fd_number = kGetBlockId; + record.sst_fd_number = 0; record.get_id = getid; record.is_cache_hit = Boolean::kFalse; record.no_insert = Boolean::kFalse; record.referenced_key = - kRefKeyPrefix + std::to_string(kGetId) + std::string(8, 'c'); + kRefKeyPrefix + std::to_string(kGetId) + kRefKeySequenceNumber; record.referenced_key_exist_in_block = Boolean::kTrue; record.referenced_data_size = 100; record.num_keys_in_block = 300; @@ -66,6 +67,29 @@ class CacheSimulatorTest : public testing::Test { return record; } + void AssertCache(std::shared_ptr sim_cache, + const MissRatioStats& miss_ratio_stats, + uint64_t expected_usage, uint64_t expected_num_accesses, + uint64_t expected_num_misses, + std::vector blocks, + std::vector keys) { + EXPECT_EQ(expected_usage, sim_cache->GetUsage()); + EXPECT_EQ(expected_num_accesses, miss_ratio_stats.total_accesses()); + EXPECT_EQ(expected_num_misses, miss_ratio_stats.total_misses()); + for (auto const& block : blocks) { + auto handle = sim_cache->Lookup(block); + EXPECT_NE(nullptr, handle); + sim_cache->Release(handle); + } + for (auto const& key : keys) { + std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber; + auto handle = + sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString() + "_0"); + EXPECT_NE(nullptr, handle); + sim_cache->Release(handle); + } + } + Env* env_; }; @@ -277,6 +301,127 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { } } +TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { + BlockCacheTraceRecord get = GenerateGetRecord(kGetId); + get.block_size = 1; + get.referenced_data_size = 0; + get.access_timestamp = 0; + get.block_key = "1"; + get.get_id = 1; + get.get_from_user_specified_snapshot = Boolean::kFalse; + get.referenced_key = + kRefKeyPrefix + std::to_string(1) + kRefKeySequenceNumber; + get.no_insert = Boolean::kFalse; + get.sst_fd_number = 0; + get.get_from_user_specified_snapshot = Boolean::kFalse; + + std::shared_ptr sim_cache = + NewLRUCache(/*capacity=*/16, /*num_shard_bits=*/1, + /*strict_capacity_limit=*/false, + /*high_pri_pool_ratio=*/0); + std::unique_ptr cache_simulator( + new HybridRowBlockCacheSimulator( + nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true)); + // Expect a miss and does not insert the row key-value pair since it does not + // have size. 
+ cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 1, 1, 1, {"1"}, + {}); + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.block_key = "2"; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 3, 2, 2, + {"1", "2"}, {"1"}); + get.access_timestamp += 1; + get.block_key = "3"; + // K1 should not inserted again. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 3, 3, + {"1", "2", "3"}, {"1"}); + + // A second get request referencing the same key. + get.access_timestamp += 1; + get.get_id = 2; + get.block_key = "4"; + get.referenced_data_size = 0; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 4, 3, + {"1", "2", "3"}, {"1"}); + + // A third get request searches three files, three different keys. + // And the second key observes a hit. + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "3"; + get.referenced_key = kRefKeyPrefix + "2" + kRefKeySequenceNumber; + // K2 should observe a miss. Block 3 observes a hit. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 5, 3, + {"1", "2", "3"}, {"1", "2"}); + + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "4"; + get.referenced_data_size = 1; + get.referenced_key = kRefKeyPrefix + "1" + kRefKeySequenceNumber; + // K1 should observe a hit. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 6, 3, + {"1", "2", "3"}, {"1", "2"}); + + get.access_timestamp += 1; + get.referenced_data_size = 1; + get.get_id = 3; + get.block_key = "4"; + get.referenced_data_size = 1; + get.referenced_key = kRefKeyPrefix + "3" + kRefKeySequenceNumber; + // K3 should observe a miss. + // However, as the get already complete, we should not access k3 any more. + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 7, 3, + {"1", "2", "3"}, {"1", "2"}); + + // A fourth get request searches one file and two blocks. One row key. + get.access_timestamp += 1; + get.get_id = 4; + get.block_key = "5"; + get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber; + get.referenced_data_size = 1; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4, + {"1", "2", "3", "5"}, {"1", "2", "4"}); + for (auto const& key : {"1", "2", "4"}) { + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + ASSERT_NE(nullptr, handle); + sim_cache->Release(handle); + } + + // A bunch of insertions which evict cached row keys. + for (uint32_t i = 6; i < 100; i++) { + get.access_timestamp += 1; + get.get_id = 0; + get.block_key = std::to_string(i); + cache_simulator->Access(get); + } + + get.get_id = 4; + // A different block. + get.block_key = "100"; + // Same row key and should not be inserted again. 
+ get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber; + get.referenced_data_size = 1; + cache_simulator->Access(get); + AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {}, + {}); + for (auto const& key : {"1", "2", "4"}) { + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + ASSERT_EQ(nullptr, handle); + } +} + TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { uint64_t block_id = 100; BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId); From 399f477818578c0d3e4614f6f148e8d7859121a2 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Mon, 29 Jul 2019 17:51:30 -0700 Subject: [PATCH 266/572] WriteUnPrepared: Use WriteUnpreparedTxnReadCallback for MultiGet (#5634) Summary: The `TransactionTest.MultiGetBatchedTest` were failing with unprepared batches because we were not using the correct callbacks. Override MultiGet to pass down the correct ReadCallback. A similar problem is also fixed in WritePrepared. This PR also fixes an issue similar to (https://github.com/facebook/rocksdb/pull/5147), but for MultiGet instead of Get. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5634 Differential Revision: D16552674 Pulled By: lth fbshipit-source-id: 736eaf8e919c6b13d5f5655b1c0d36b57ad04804 --- db/db_impl/db_impl.cc | 18 +++++++++++++++++ utilities/transactions/write_prepared_txn.cc | 19 ++++++++++++++++++ utilities/transactions/write_prepared_txn.h | 7 +++++++ .../transactions/write_unprepared_txn.cc | 20 +++++++++++++++++++ utilities/transactions/write_unprepared_txn.h | 7 +++++++ 5 files changed, 71 insertions(+) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 16a6d86a658..29b7f6f1470 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1857,6 +1857,24 @@ void DBImpl::MultiGetImpl( snapshot = last_seq_same_as_publish_seq_ ? versions_->LastSequence() : versions_->LastPublishedSequence(); + if (callback) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + callback->Refresh(snapshot); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. 
+ // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = callback->max_visible_seq(); + } } // For each of the given keys, apply the entire "get" process as follows: diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index f4c21d4769e..97bebac5d57 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -40,6 +40,25 @@ void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) { prepare_batch_cnt_ = 0; } +void WritePreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input) { + SequenceNumber min_uncommitted, snap_seq; + const bool backed_by_snapshot = + wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + Status WritePreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h index 2cd729cd2c7..c574f62310f 100644 --- a/utilities/transactions/write_prepared_txn.h +++ b/utilities/transactions/write_prepared_txn.h @@ -53,6 +53,13 @@ class WritePreparedTxn : public PessimisticTransaction { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input = false) override; + // Note: The behavior is undefined in presence of interleaved writes to the // same transaction. 
// To make WAL commit markers visible, the snapshot will be diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index c677013aa03..d8c5eea5561 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -524,6 +524,26 @@ void WriteUnpreparedTxn::Clear() { TransactionBaseImpl::Clear(); } +void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input) { + SequenceNumber min_uncommitted, snap_seq; + const bool backed_by_snapshot = + wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + unprep_seqs_); + write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, + keys, values, statuses, sorted_input, + &callback); + if (UNLIKELY(!wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + for (size_t i = 0; i < num_keys; i++) { + statuses[i] = Status::TryAgain(); + } + } +} + Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index bc952544ab0..2c23155946a 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -146,6 +146,13 @@ class WriteUnpreparedTxn : public WritePreparedTxn { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + using Transaction::MultiGet; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + bool sorted_input = false) override; + using Transaction::GetIterator; virtual Iterator* GetIterator(const ReadOptions& options) override; virtual Iterator* GetIterator(const ReadOptions& options, From 849a8c0ae0a0d72e0872f8c497626e1ff6dd8af9 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 30 Jul 2019 14:09:02 -0700 Subject: [PATCH 267/572] fix sign compare warnings (#5651) Summary: Fix -Wsign-compare warnings for gcc9. 
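The warnings all share one pattern: an unsigned value compared against the maximum of a signed type. A hedged sketch of the fix applied throughout (using `int64_t` to stand in for platform types such as `off_t`; `FitsInSignedRange` is a made-up name for illustration):

```cpp
#include <cstdint>
#include <limits>

// gcc9 emits -Wsign-compare for `offset <= std::numeric_limits<int64_t>::max()`
// because the left side is unsigned and the right side is signed. Casting the
// signed bound to the unsigned type silences the warning without changing the
// result, since the maximum of a signed type always fits in the unsigned one.
bool FitsInSignedRange(uint64_t offset) {
  return offset <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
}
```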
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5651 Test Plan: Tested with ubuntu19.10+gcc9 Differential Revision: D16567428 fbshipit-source-id: 730b2704d42ba0c4e4ea946a3199bbb34be4c25c --- env/io_posix.cc | 14 +++++++------- port/port_posix.cc | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/env/io_posix.cc b/env/io_posix.cc index 293516feee8..bcc9ab5272e 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -803,8 +803,8 @@ Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); int alloc_status = 0; if (allow_fallocate_) { @@ -873,7 +873,7 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } - assert(offset <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); const char* src = data.data(); size_t nbytes = data.size(); if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { @@ -1009,8 +1009,8 @@ Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; @@ -1031,8 +1031,8 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { #ifdef ROCKSDB_RANGESYNC_PRESENT - assert(offset <= std::numeric_limits::max()); - assert(nbytes <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(nbytes <= static_cast(std::numeric_limits::max())); if (sync_file_range_supported_) { int ret; if (strict_bytes_per_sync_) { diff --git a/port/port_posix.cc b/port/port_posix.cc index f19d18ff0e6..167159d83c8 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -192,7 +192,8 @@ int GetMaxOpenFiles() { return -1; } // protect against overflow - if (no_files_limit.rlim_cur >= std::numeric_limits::max()) { + if (static_cast(no_files_limit.rlim_cur) >= + static_cast(std::numeric_limits::max())) { return std::numeric_limits::max(); } return static_cast(no_files_limit.rlim_cur); From 55f4f5486d4fc0657100d34a0ca0d4fa81a18350 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 30 Jul 2019 15:56:41 -0700 Subject: [PATCH 268/572] Update buckifier templates (#5647) Summary: Update buckifier templates in the scripts. Test plan (on devserver) ``` $python buckifier/buckify_rocksdb.py ``` Then ``` $git diff ``` Verify that generated TARGETS file is the same (except for indentation). 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5647 Differential Revision: D16555647 Pulled By: riversand963 fbshipit-source-id: 32574a4d0e820858eab2391304dd731141719bcd --- buckifier/targets_cfg.py | 82 ++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 79648bb6a6d..0ebd6d9427e 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -11,29 +11,9 @@ ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] @@ -46,11 +26,54 @@ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -58,7 +81,6 @@ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -75,9 +97,15 @@ # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. 
-ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) - -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) + +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) """ @@ -88,6 +116,8 @@ {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -127,6 +157,8 @@ rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, From 265db3ebb525460c78d9ee8dfb573905beb972eb Mon Sep 17 00:00:00 2001 From: Fosco Marotto Date: Tue, 30 Jul 2019 16:05:19 -0700 Subject: [PATCH 269/572] Update history and version for 6.4.0 (#5652) Summary: Master branch had been left at 6.2 and history of 6.3 and beyond were merged. Updated this to correct. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5652 Differential Revision: D16570498 Pulled By: gfosco fbshipit-source-id: 79f62ec570539a3e3d7d7c84a6cf7b722395fafe --- HISTORY.md | 51 ++++++++++++++++++++++++++------------- include/rocksdb/version.h | 2 +- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ace55cab404..ba96b0e4ba5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,46 +1,64 @@ # Rocksdb Change Log ## Unreleased + +## 6.4.0 (7/30/2019) ### Default Option Change * LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explictly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. ### Public API Change -* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Index, filter, and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index, filter, and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any), and cached index blocks can be shared among multiple table readers. -* Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to the above refactoring, block cache eviction statistics for indexes, filters, and compression dictionaries are temporarily broken. We plan to reintroduce them in a later phase. * Errors related to the retrieval of the compression dictionary are now propagated to the user. -* options.keep_log_file_num will be enforced strictly all the time. 
File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. -* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. -* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. -* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. -* Add C bindings for secondary instance, i.e. DBImplSecondary. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. -* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path * Overload GetAllKeyVersions() to support non-default column family. * Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. * Replaces old Registra with ObjectRegistry to allow user to create custom object from string, also add LoadEnv() to Env. * Added new overload of GetApproximateSizes which gets SizeApproximationOptions object and returns a Status. The older overloads are redirecting their calls to this new method and no longer assert if the include_flags doesn't have either of INCLUDE_MEMTABLES or INCLUDE_FILES bits set. It's recommended to use the new method only, as it is more type safe and returns a meaningful status in case of errors. +### New Features +* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. +* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. +* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. + +### Performance Improvements +* Reduce iterator key comparison for upper/lower bound check. +* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. +* The compression dictionary is no longer copied to a new object upon retrieval. + +### Bug Fixes +* Fix ingested file and directory not being fsynced. +* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. + + +## 6.3.1 (7/24/2019) +### Bug Fixes +* Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails. + +## 6.3.0 (6/18/2019) +### Public API Change +* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Partitions of partitioned indexes no longer affect the read amplification statistics. +* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
+* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. +* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. +* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. +* Add C bindings for secondary instance, i.e. DBImplSecondary. +* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path + ### New Features * Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. -* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. -* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. -* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. * DBIter::Next() can skip user key checking if previous entry's seqnum is 0. * Merging iterator to avoid child iterator reseek for some cases -* Reduce iterator key comparision for upper/lower bound check. * Log Writer will flush after finishing the whole record, rather than a fragment. * Lower MultiGet batching API latency by reading data blocks from disk in parallel -* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. -* The compression dictionary is no longer copied to a new object upon retrieval. ### General Improvements * Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. @@ -50,11 +68,10 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level.
* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. -* Fix ingested file and directory not being fsync. -* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator. * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. * On DB open, delete WAL trash files left behind in wal_dir + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 7b7d7e86224..d86c5fc886c 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 2 +#define ROCKSDB_MINOR 4 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From b538e756c29eac69e5362d9dff52833200d3e242 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 30 Jul 2019 17:41:15 -0700 Subject: [PATCH 270/572] Split the recent block based table changes between 6.3 and 6.4 in HISTORY.md Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5653 Differential Revision: D16573445 Pulled By: ltamasi fbshipit-source-id: 19c639044fcfd43b5d5c627c8def33ff2dbb2af8 --- HISTORY.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ba96b0e4ba5..9e057250aee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,7 +7,9 @@ * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. ### Public API Change -* Index, filter, and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index, filter, and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any), and cached index blocks can be shared among multiple table readers. +* Filter and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, filter and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. +* Due to the above refactoring, block cache eviction statistics for filter and compression dictionary blocks are temporarily broken. We plan to reintroduce them in a later phase. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Errors related to the retrieval of the compression dictionary are now propagated to the user. * db_bench adds a "benchmark" stats_history, which prints out the whole stats history. * Overload GetAllKeyVersions() to support non-default column family. @@ -38,12 +40,12 @@ ## 6.3.0 (6/18/2019) ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. 
+* Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. -* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. * Add C bindings for secondary instance, i.e. DBImplSecondary. * Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path From 4834dab578114b429163746acbcb93073bb5784f Mon Sep 17 00:00:00 2001 From: Eli Pozniansky Date: Wed, 31 Jul 2019 08:46:48 -0700 Subject: [PATCH 271/572] Improve CPU Efficiency of ApproximateSize (part 2) (#5609) Summary: In some cases, we don't have to get a really accurate number. Something like 10% off is fine, so we can create a new option for that use case. In this case, we can calculate the size of the full files first, and avoid estimation inside SST files if the full files already got us a huge number. For example, if we have already covered 100GB of data, we should be able to skip partial dives into 10 SST files of 30MB each. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5609 Differential Revision: D16433481 Pulled By: elipoz fbshipit-source-id: 5830b31e1c656d0fd3a00d7fd2678ddc8f6e601b --- HISTORY.md | 1 + db/compaction/compaction_job.cc | 3 +- db/db_impl/db_impl.cc | 4 +- db/db_test.cc | 96 ++++++++++++++++--- db/version_set.cc | 161 +++++++++++++++++++++----------- db/version_set.h | 8 +- include/rocksdb/options.h | 10 ++ 7 files changed, 208 insertions(+), 75 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9e057250aee..201cef2b1b3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,7 @@ * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. * Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. +* Added new option in SizeApproximationOptions used with DB::GetApproximateSizes.
When approximating the total size of the files used to store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin. ### Performance Improvements * Reduce iterator key comparision for upper/lower bound check. diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index db701d19dad..663c8aa0a80 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() { // to the index block and may incur I/O cost in the process. Unlock db // mutex to reduce contention db_mutex_->Unlock(); - uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, + uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a, + b, start_lvl, out_lvl + 1, TableReaderCaller::kCompaction); db_mutex_->Lock(); ranges.emplace_back(a, b, size); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 29b7f6f1470..81c44388bcf 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2808,8 +2808,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( - v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, - TableReaderCaller::kUserApproximateSize); + options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } if (options.include_memtabtles) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; diff --git a/db/db_test.cc b/db/db_test.cc index f247ddb80fa..f53afa17d9d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1257,6 +1257,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) { options.compression = kNoCompression; options.create_if_missing = true; DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); const int N = 128; Random rnd(301); @@ -1268,9 +1269,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) { std::string start = Key(50); std::string end = Key(60); Range r(start, end); - uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES | - DB::SizeApproximationFlags::INCLUDE_MEMTABLES; - db_->GetApproximateSizes(&r, 1, &size, include_both); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table @@ -1280,7 +1282,7 @@ start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { @@ -1290,19 +1292,20 @@ start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000);
options.max_write_buffer_number = 8; options.min_write_buffer_number_to_merge = 5; options.write_buffer_size = 1024 * N; // Not very large DestroyAndReopen(options); + default_cf = db_->DefaultColumnFamily(); int keys[N * 3]; for (int i = 0; i < N; i++) { @@ -1319,26 +1322,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); ASSERT_GT(size_with_mt, 6000); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_EQ(size_without_mt, 0); @@ -1352,10 +1356,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_without_mt, 6000); + + // Check that include_memtabtles flag works as expected + size_approx_options.include_memtabtles = false; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, size_without_mt); + + // Check that files_size_error_margin works as expected, when the heuristic + // conditions are not met + start = Key(1); + end = Key(1000 + N - 2); + r = Range(start, end); + size_approx_options.files_size_error_margin = -1.0; // disabled + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + uint64_t size2; + size_approx_options.files_size_error_margin = 0.5; // enabled, but not used + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_EQ(size, size2); +} + +TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + Options options = CurrentOptions(); + options.write_buffer_size = 1024 * 1024; + options.compression = kNoCompression; + options.create_if_missing = true; + options.target_file_size_base = 1024 * 1024; + DestroyAndReopen(options); + const auto default_cf = db_->DefaultColumnFamily(); + + const int N = 64000; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files + Flush(); + // Compact the entire key space into the next level + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + + // Write more keys + for (int i = N; i < (N + N / 4); i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files again + Flush(); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = 
false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); } TEST_F(DBTest, GetApproximateMemTableStats) { diff --git a/db/version_set.cc b/db/version_set.cc index 7d477a6806b..3a1f47790c5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. -uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, +uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + // pre-condition - assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); + assert(icmp.Compare(start, end) <= 0); - uint64_t size = 0; + uint64_t total_full_size = 0; const auto* vstorage = v->storage_info(); - end_level = end_level == -1 - ? vstorage->num_non_empty_levels() - : std::min(end_level, vstorage->num_non_empty_levels()); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); assert(start_level <= end_level); - for (int level = start_level; level < end_level; level++) { + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. 
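+ //
+ // Illustrative arithmetic (hypothetical numbers, not from a real run): with
+ // files_size_error_margin = 0.1, suppose the files fully contained in the
+ // range sum to total_full_size = 100 GB, while the boundary (intersecting)
+ // files sum to total_intersecting_size = 1 GB. Since 1 GB < 100 GB * 0.1,
+ // the per-file binary searches are skipped and 1 GB / 2 = 0.5 GB is added
+ // to the estimate. The true contribution of the boundary files lies in
+ // [0 GB, 1 GB], so the result is off by at most 0.5 GB, i.e. ~0.5% of the
+ // total, well within the requested 10% margin.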
+ + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); - if (!files_brief.num_files) { + if (files_brief.num_files == 0) { // empty level, skip exploration continue; } - if (!level) { - // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end, caller); + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } continue; } assert(level > 0); assert(files_brief.num_files > 0); - // identify the file position for starting key - const uint64_t idx_start = FindFileInRange( - v->cfd_->internal_comparator(), files_brief, start, - /*start=*/0, static_cast(files_brief.num_files - 1)); - assert(idx_start < files_brief.num_files); - - // scan all files from the starting position until the ending position - // inferred from the sorted order - for (uint64_t i = idx_start; i < files_brief.num_files; i++) { - uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end, caller); - if (!val) { - // the files after this will not have the range - break; - } + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); - size += val; + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); - if (i == idx_start) { - // subtract the bytes needed to be scanned to get to the starting - // key - val = ApproximateSize(v, files_brief.files[i], start, caller); - assert(size >= val); - size -= val; - } + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == + ApproximateSize(v, files_brief.files[i], end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. + first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); } } - return size; -} + // The sum of all file sizes that intersect the [start, end] keys range. 
+ uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } -uint64_t VersionSet::ApproximateSizeLevel0(Version* v, - const LevelFilesBrief& files_brief, - const Slice& key_start, - const Slice& key_end, - TableReaderCaller caller) { - // level 0 files are not in sorted order, we need to iterate through - // the list to compute the total bytes that require scanning - uint64_t size = 0; - for (size_t i = 0; i < files_brief.num_files; i++) { - const uint64_t start = - ApproximateSize(v, files_brief.files[i], key_start, caller); - const uint64_t end = - ApproximateSize(v, files_brief.files[i], key_end, caller); - assert(end >= start); - size += end - start; + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files, at each level + for (const auto file_ptr : first_files) { + total_full_size += ApproximateSize(v, *file_ptr, end, caller); + // subtract the bytes needed to be scanned to get to the starting key + uint64_t val = ApproximateSize(v, *file_ptr, start, caller); + assert(total_full_size >= val); + total_full_size -= val; + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + total_full_size += ApproximateSize(v, *file_ptr, end, caller); + } } - return size; + + return total_full_size; } uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, @@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, TableReaderCaller caller) { // pre-condition assert(v); + const auto& icmp = v->cfd_->internal_comparator(); uint64_t result = 0; - if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) { + if (icmp.Compare(f.largest_key, key) <= 0) { // Entire file is before "key", so just add the file size result = f.fd.GetFileSize(); - } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) { + } else if (icmp.Compare(f.smallest_key, key) > 0) { // Entire file is after "key", so ignore result = 0; } else { @@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, TableCache* table_cache = v->cfd_->table_cache(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( - key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(), + key, f.file_metadata->fd, caller, icmp, v->GetMutableCFOptions().prefix_extractor.get()); } } diff --git a/db/version_set.h b/db/version_set.h index ee94f5966df..391bb902c4b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -983,7 +983,8 @@ class VersionSet { // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). 
If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, + uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller); @@ -1033,11 +1034,6 @@ class VersionSet { } }; - // ApproximateSize helper - uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, - const Slice& start, const Slice& end, - TableReaderCaller caller); - uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 5ae010b8f52..bda44d4417c 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1514,6 +1514,16 @@ struct SizeApproximationOptions { // Defines whether the returned size should include data serialized to disk. // If set to false, include_memtabtles must be true. bool include_files = true; + // When approximating the total size of the files used to store a key range + // using DB::GetApproximateSizes, allow approximation with an error margin of + // up to total_files_size * files_size_error_margin. This allows taking some + // shortcuts in file size approximation, resulting in better performance, + // while guaranteeing the resulting error is within a reasonable margin. + // E.g., if the value is 0.1, then the error margin of the returned file size + // approximation will be within 10%. + // If the value is non-positive, a more precise yet more CPU-intensive + // estimation is performed. + double files_size_error_margin = -1.0; }; } // namespace rocksdb From d599135a0332a8aa08abe56d08027f61331ef9e3 Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Wed, 31 Jul 2019 10:41:05 -0700 Subject: [PATCH 272/572] WriteUnPrepared: use WriteUnpreparedTxnReadCallback for ValidateSnapshot (#5657) Summary: In DeferSnapshotSavePointTest, writes were failing with a snapshot validation error because the key with the latest sequence number was an unprepared key from the current transaction. Fix this by passing down the correct read callback. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5657 Differential Revision: D16582466 Pulled By: lth fbshipit-source-id: 11645dac0e7c1374d917ef5fdf757d13c1d1108d --- .../transactions/write_unprepared_txn.cc | 33 +++++++++++++++++++ utilities/transactions/write_unprepared_txn.h | 4 +++ 2 files changed, 37 insertions(+) diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index d8c5eea5561..c5f4db5bd56 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -574,6 +574,39 @@ Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options, return write_batch_.NewIteratorWithBase(column_family, db_iter); } +Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) { + // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot logic.
+ assert(snapshot_); + + SequenceNumber min_uncommitted = + static_cast_with_check( + snapshot_.get()) + ->min_uncommitted_; + SequenceNumber snap_seq = snapshot_->GetSequenceNumber(); + // tracked_at_seq is either max or the last snapshot with which this key was + // tracked, so there is no need to apply the IsInSnapshot to this comparison + // here as tracked_at_seq is not a prepare seq. + if (*tracked_at_seq <= snap_seq) { + // If the key has been previously validated at a sequence number earlier + // than the current snapshot's sequence number, we already know it has not + // been modified. + return Status::OK(); + } + + *tracked_at_seq = snap_seq; + + ColumnFamilyHandle* cfh = + column_family ? column_family : db_impl_->DefaultColumnFamily(); + + WriteUnpreparedTxnReadCallback snap_checker(wupt_db_, snap_seq, + min_uncommitted, unprep_seqs_); + return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(), + snap_seq, false /* cache_only */, + &snap_checker, min_uncommitted); +} + const std::map& WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() { return unprep_seqs_; diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 2c23155946a..77c18033898 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -158,6 +158,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { virtual Iterator* GetIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) override; + virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family, + const Slice& key, + SequenceNumber* tracked_at_seq) override; + private: friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; From f622ca2c7c12ff13b24083b57d1279aaa38a2ccd Mon Sep 17 00:00:00 2001 From: Manuel Ung Date: Wed, 31 Jul 2019 13:36:22 -0700 Subject: [PATCH 273/572] WriteUnPrepared: savepoint support (#5627) Summary: Add savepoint support when the current transaction has flushed unprepared batches. Rolling back to a savepoint is similar to rolling back a transaction. It requires finding the set of keys that have changed since the savepoint, re-reading the keys at the snapshot at that savepoint, and then restoring the old keys by writing out another unprepared batch. For this strategy to work though, we must be capable of reading keys at a savepoint. This does not work if keys were written out using the same sequence number before and after a savepoint. Therefore, when we flush out unprepared batches, we must split the batch by savepoint if any savepoints exist. eg. If we have the following:
```
Put(A)
Put(B)
Put(C)
SetSavePoint()
Put(D)
Put(E)
SetSavePoint()
Put(F)
```
Then we will write out 3 separate unprepared batches:
```
Put(A) 1
Put(B) 1
Put(C) 1
Put(D) 2
Put(E) 2
Put(F) 3
```
This is so that when we roll back to eg. the first savepoint, we can just read keys at snapshot_seq = 1.
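For illustration only (not part of this patch), a minimal sketch of the client-visible savepoint API this change supports, assuming a TransactionDB opened with the write-unprepared policy (TransactionDBOptions::write_policy = WRITE_UNPREPARED); error handling is elided:
```
#include <cassert>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

// Hedged sketch: savepoints on a transaction whose unprepared batches may
// have been flushed to the DB before commit.
void SavePointSketch(rocksdb::TransactionDB* txn_db) {
  rocksdb::WriteOptions write_options;
  rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);

  txn->Put("A", "a");   // lands in the first unprepared batch
  txn->SetSavePoint();  // later flushes must split the batch here
  txn->Put("B", "b");   // lands in a batch with a newer sequence number

  // Reads back the pre-savepoint state at the savepoint's snapshot and
  // writes the old values out as another unprepared batch.
  rocksdb::Status s = txn->RollbackToSavePoint();
  assert(s.ok());

  s = txn->Commit();  // "A" commits; the write to "B" was rolled back
  assert(s.ok());
  delete txn;
}
```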
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5627 Differential Revision: D16584130 Pulled By: lth fbshipit-source-id: 6d100dd548fb20c4b76661bd0f8a2647e64477fa --- db/write_batch.cc | 48 ++-- db/write_batch_internal.h | 4 + .../utilities/write_batch_with_index.h | 2 + include/rocksdb/write_batch.h | 2 +- utilities/transactions/transaction_base.cc | 4 +- utilities/transactions/transaction_base.h | 20 +- .../transactions/write_unprepared_txn.cc | 258 +++++++++++++++++- utilities/transactions/write_unprepared_txn.h | 60 +++- .../transactions/write_unprepared_txn_db.cc | 4 +- .../write_batch_with_index.cc | 5 + 10 files changed, 378 insertions(+), 29 deletions(-) diff --git a/db/write_batch.cc b/db/write_batch.cc index 2c2d81e87f6..8a896644fc2 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -511,12 +511,25 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, } Status WriteBatch::Iterate(Handler* handler) const { - Slice input(rep_); - if (input.size() < WriteBatchInternal::kHeader) { + if (rep_.size() < WriteBatchInternal::kHeader) { return Status::Corruption("malformed WriteBatch (too small)"); } - input.remove_prefix(WriteBatchInternal::kHeader); + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, static_cast(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + Slice key, value, blob, xid; // Sometimes a sub-batch starts with a Noop. 
We want to exclude such Noops as // the batch boundary symbols otherwise we would mis-count the number of @@ -547,7 +560,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } } else { assert(s.IsTryAgain()); - assert(!last_was_try_again); // to detect infinite loop bugs + assert(!last_was_try_again); // to detect infinite loop bugs if (UNLIKELY(last_was_try_again)) { return Status::Corruption( "two consecutive TryAgain in WriteBatch handler; this is either a " @@ -560,7 +573,7 @@ Status WriteBatch::Iterate(Handler* handler) const { switch (tag) { case kTypeColumnFamilyValue: case kTypeValue: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); s = handler->PutCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -570,7 +583,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyDeletion: case kTypeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); s = handler->DeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -580,7 +593,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilySingleDeletion: case kTypeSingleDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); s = handler->SingleDeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -590,7 +603,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyRangeDeletion: case kTypeRangeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); s = handler->DeleteRangeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -600,7 +613,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyMerge: case kTypeMerge: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); s = handler->MergeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -610,7 +623,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyBlobIndex: case kTypeBlobIndex: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); s = handler->PutBlobIndexCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -623,7 +636,7 @@ Status WriteBatch::Iterate(Handler* handler) const { empty_batch = false; break; case kTypeBeginPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -642,7 +655,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeBeginPersistedPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -655,7 +668,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } 
break; case kTypeBeginUnprepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); handler->MarkBeginPrepare(true /* unprepared */); empty_batch = false; @@ -674,19 +687,19 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeEndPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); handler->MarkEndPrepare(xid); empty_batch = true; break; case kTypeCommitXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); handler->MarkCommit(xid); empty_batch = true; break; case kTypeRollbackXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); handler->MarkRollback(xid); empty_batch = true; @@ -702,7 +715,8 @@ Status WriteBatch::Iterate(Handler* handler) const { if (!s.ok()) { return s; } - if (handler_continue && found != WriteBatchInternal::Count(this)) { + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index bae62bf0317..67136a84716 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -192,6 +192,10 @@ class WriteBatchInternal { // leftByteSize and a WriteBatch with ByteSize rightByteSize static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); + // Iterate over [begin, end) range of a write batch + static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler, + size_t begin, size_t end); + // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. 
static void SetAsLastestPersistentState(WriteBatch* b); diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 34e6c46895c..586088d7519 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -100,6 +100,8 @@ class WriteBatchWithIndex : public WriteBatchBase { size_t max_bytes = 0); ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); using WriteBatchBase::Put; Status Put(ColumnFamilyHandle* column_family, const Slice& key, diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 393c5d9c6ab..b6b7c8bb820 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -271,7 +271,7 @@ class WriteBatch : public WriteBatchBase { virtual bool Continue(); protected: - friend class WriteBatch; + friend class WriteBatchInternal; virtual bool WriteAfterCommit() const { return true; } virtual bool WriteBeforePrepare() const { return false; } }; diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index bf59a1c4069..30861f09148 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -30,7 +30,7 @@ TransactionBaseImpl::TransactionBaseImpl(DB* db, assert(dynamic_cast(db_) != nullptr); log_number_ = 0; if (dbimpl_->allow_2pc()) { - WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + InitWriteBatch(); } } @@ -49,7 +49,7 @@ void TransactionBaseImpl::Clear() { num_merges_ = 0; if (dbimpl_->allow_2pc()) { - WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + InitWriteBatch(); } } diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 657e9c59656..72fa9d26af4 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -11,6 +11,7 @@ #include #include +#include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/snapshot.h" @@ -273,6 +274,15 @@ class TransactionBaseImpl : public Transaction { // Sets a snapshot if SetSnapshotOnNextOperation() has been called. void SetSnapshotIfNeeded(); + // Initialize write_batch_ for 2PC by inserting Noop. + inline void InitWriteBatch(bool clear = false) { + if (clear) { + write_batch_.Clear(); + } + assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader); + WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + } + DB* db_; DBImpl* dbimpl_; @@ -325,16 +335,18 @@ class TransactionBaseImpl : public Transaction { // Optimistic Transactions will wait till commit time to do conflict checking. TransactionKeyMap tracked_keys_; + // Stack of the Snapshot saved at each save point. Saved snapshots may be + // nullptr if there was no snapshot at the time SetSavePoint() was called. + std::unique_ptr>> + save_points_; + private: friend class WritePreparedTxn; // Extra data to be persisted with the commit. Note this is only used when // prepare phase is not skipped. WriteBatch commit_time_batch_; - // Stack of the Snapshot saved at each save point. Saved snapshots may be - // nullptr if there was no snapshot at the time SetSavePoint() was called. - std::unique_ptr>> save_points_; - // If true, future Put/Merge/Deletes will be indexed in the // WriteBatchWithIndex. 
// If false, future Put/Merge/Deletes will be inserted directly into the diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index c5f4db5bd56..993c3b8b60c 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -78,6 +78,8 @@ void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) { } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); recovered_txn_ = false; largest_validated_seq_ = 0; } @@ -236,6 +238,20 @@ Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() { } Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { + // If the current write batch contains savepoints, then some special handling + // is required so that RollbackToSavepoint can work. + // + // RollbackToSavepoint is not supported after Prepare() is called, so only do + // this for unprepared batches. + if (!prepared && unflushed_save_points_ != nullptr && + !unflushed_save_points_->empty()) { + return FlushWriteBatchWithSavePointToDB(); + } + + return FlushWriteBatchToDBInternal(prepared); +} + +Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) { if (name_.empty()) { return Status::InvalidArgument("Cannot write to DB without SetName."); } @@ -285,13 +301,118 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) { // Reset transaction state. if (!prepared) { prepare_batch_cnt_ = 0; - write_batch_.Clear(); - WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch()); + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); } return s; } +Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() { + assert(unflushed_save_points_ != nullptr && + unflushed_save_points_->size() > 0); + assert(save_points_ != nullptr && save_points_->size() > 0); + assert(save_points_->size() >= unflushed_save_points_->size()); + + // Handler class for creating an unprepared batch from a savepoint. + struct SavePointBatchHandler : public WriteBatch::Handler { + WriteBatchWithIndex* wb_; + const std::map& handles_; + + SavePointBatchHandler( + WriteBatchWithIndex* wb, + const std::map& handles) + : wb_(wb), handles_(handles) {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Put(handles_.at(cf), key, value); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return wb_->Delete(handles_.at(cf), key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return wb_->SingleDelete(handles_.at(cf), key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + return wb_->Merge(handles_.at(cf), key, value); + } + + // The only expected 2PC marker is the initial Noop marker. + Status MarkNoop(bool empty_batch) override { + return empty_batch ? Status::OK() : Status::InvalidArgument(); + } + + Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); } + + Status MarkEndPrepare(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkCommit(const Slice&) override { + return Status::InvalidArgument(); + } + + Status MarkRollback(const Slice&) override { + return Status::InvalidArgument(); + } + }; + + // The comparator of the default cf is passed in, similar to the + // initialization of TransactionBaseImpl::write_batch_. This comparator is + // only used if the write batch encounters an invalid cf id, and falls back to + // this comparator. 
+ WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0, + true, 0); + // Swap with write_batch_ so that wb contains the complete write batch. The + // actual write batch that will be flushed to DB will be built in + // write_batch_, and will be read by FlushWriteBatchToDBInternal. + std::swap(wb, write_batch_); + TransactionBaseImpl::InitWriteBatch(); + + size_t prev_boundary = WriteBatchInternal::kHeader; + const bool kPrepared = true; + for (size_t i = 0; i < unflushed_save_points_->size(); i++) { + SavePointBatchHandler sp_handler(&write_batch_, + *wupt_db_->GetCFHandleMap().get()); + size_t curr_boundary = (*unflushed_save_points_)[i]; + + // Construct the partial write batch up to the savepoint. + // + // Theoretically, a memcpy between the write batches should be sufficient + // since the rewriting into the batch should produce the exact same byte + // representation. Rebuilding the WriteBatchWithIndex index is still + // necessary, though, and would imply doing two passes over the batch. + Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler, + prev_boundary, curr_boundary); + if (!s.ok()) { + return s; + } + + // Flush the write batch. + s = FlushWriteBatchToDBInternal(!kPrepared); + if (!s.ok()) { + return s; + } + + if (flushed_save_points_ == nullptr) { + flushed_save_points_.reset( + new autovector()); + } + flushed_save_points_->emplace_back( + unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot())); + + prev_boundary = curr_boundary; + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + } + + unflushed_save_points_->clear(); + return Status::OK(); +} + Status WriteUnpreparedTxn::PrepareInternal() { const bool kPrepared = true; return FlushWriteBatchToDB(kPrepared); @@ -379,6 +500,8 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt); } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } // else do the 2nd write to publish seq @@ -410,6 +533,8 @@ Status WriteUnpreparedTxn::CommitInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } @@ -488,6 +613,8 @@ Status WriteUnpreparedTxn::RollbackInternal() { wpt_db_->RemovePrepared(seq.first, seq.second); } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } // else do the 2nd write for commit uint64_t& prepare_seq = seq_used; @@ -514,6 +641,8 @@ Status WriteUnpreparedTxn::RollbackInternal() { } unprep_seqs_.clear(); + flushed_save_points_.reset(nullptr); + unflushed_save_points_.reset(nullptr); return s; } @@ -524,6 +653,131 @@ void WriteUnpreparedTxn::Clear() { TransactionBaseImpl::Clear(); } +void WriteUnpreparedTxn::SetSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + PessimisticTransaction::SetSavePoint(); + if (unflushed_save_points_ == nullptr) { + unflushed_save_points_.reset(new autovector()); + } + unflushed_save_points_->push_back(write_batch_.GetDataSize()); +} + +Status WriteUnpreparedTxn::RollbackToSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ?
save_points_->size() : 0)); + if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) { + Status s = PessimisticTransaction::RollbackToSavePoint(); + assert(!s.IsNotFound()); + unflushed_save_points_->pop_back(); + return s; + } + + if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) { + return RollbackToSavePointInternal(); + } + + return Status::NotFound(); +} + +Status WriteUnpreparedTxn::RollbackToSavePointInternal() { + Status s; + + const bool kClear = true; + TransactionBaseImpl::InitWriteBatch(kClear); + + assert(flushed_save_points_->size() > 0); + WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back(); + + assert(top.unprep_seqs_.size() > 0); + assert(save_points_ != nullptr && save_points_->size() > 0); + const TransactionKeyMap& tracked_keys = save_points_->top().new_keys_; + + // TODO(lth): Reduce duplicate code with RollbackInternal logic. + ReadOptions roptions; + roptions.snapshot = top.snapshot_->snapshot(); + SequenceNumber min_uncommitted = + static_cast_with_check( + roptions.snapshot) + ->min_uncommitted_; + SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber(); + WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, + top.unprep_seqs_); + const auto& cf_map = *wupt_db_->GetCFHandleMap(); + for (const auto& cfkey : tracked_keys) { + const auto cfid = cfkey.first; + const auto& keys = cfkey.second; + + for (const auto& pair : keys) { + const auto& key = pair.first; + const auto& cf_handle = cf_map.at(cfid); + PinnableSlice pinnable_val; + bool not_used; + s = db_impl_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, + &callback); + + if (s.ok()) { + s = write_batch_.Put(cf_handle, key, pinnable_val); + assert(s.ok()); + } else if (s.IsNotFound()) { + s = write_batch_.Delete(cf_handle, key); + assert(s.ok()); + } else { + return s; + } + } + } + + const bool kPrepared = true; + s = FlushWriteBatchToDBInternal(!kPrepared); + assert(s.ok()); + if (!s.ok()) { + return s; + } + + // PessimisticTransaction::RollbackToSavePoint will also call + // RollbackToSavepoint on write_batch_. However, write_batch_ is empty and has + // no savepoints because this savepoint has already been flushed. Work around + // this by setting a fake savepoint. + write_batch_.SetSavePoint(); + s = PessimisticTransaction::RollbackToSavePoint(); + assert(s.ok()); + if (!s.ok()) { + return s; + } + + flushed_save_points_->pop_back(); + return s; +} + +Status WriteUnpreparedTxn::PopSavePoint() { + assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) + + (flushed_save_points_ ? flushed_save_points_->size() : 0) == + (save_points_ ? save_points_->size() : 0)); + if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) { + Status s = PessimisticTransaction::PopSavePoint(); + assert(!s.IsNotFound()); + unflushed_save_points_->pop_back(); + return s; + } + + if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) { + // PessimisticTransaction::PopSavePoint will also call PopSavePoint on + // write_batch_. However, write_batch_ is empty and has no savepoints + // because this savepoint has already been flushed. Work around this by + // setting a fake savepoint.
+    write_batch_.SetSavePoint();
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    flushed_save_points_->pop_back();
+    return s;
+  }
+
+  return Status::NotFound();
+}
+
 void WriteUnpreparedTxn::MultiGet(const ReadOptions& options,
                                   ColumnFamilyHandle* column_family,
                                   const size_t num_keys, const Slice* keys,
diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h
index 77c18033898..774d90e8d37 100644
--- a/utilities/transactions/write_unprepared_txn.h
+++ b/utilities/transactions/write_unprepared_txn.h
@@ -73,7 +73,6 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback {
     wup_snapshot_ = seq;
   }

- private:
   static SequenceNumber CalcMaxVisibleSeq(
       const std::map<SequenceNumber, size_t>& unprep_seqs,
       SequenceNumber snapshot_seq) {
@@ -84,6 +83,8 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback {
     }
     return std::max(max_unprepared, snapshot_seq);
   }
+
+ private:
   WritePreparedTxnDB* db_;
   const std::map<SequenceNumber, size_t>& unprep_seqs_;
   SequenceNumber wup_snapshot_;
@@ -139,6 +140,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn {

   void Clear() override;

+  void SetSavePoint() override;
+  Status RollbackToSavePoint() override;
+  Status PopSavePoint() override;
+
   // Get and GetIterator need to be overridden so that a ReadCallback to
   // handle read-your-own-write is used.
   using Transaction::Get;
@@ -172,6 +177,9 @@ class WriteUnpreparedTxn : public WritePreparedTxn {
   Status MaybeFlushWriteBatchToDB();
   Status FlushWriteBatchToDB(bool prepared);
+  Status FlushWriteBatchToDBInternal(bool prepared);
+  Status FlushWriteBatchWithSavePointToDB();
+  Status RollbackToSavePointInternal();
   Status HandleWrite(std::function<Status()> do_write);

   // For write unprepared, we check on every writebatch append to see if
@@ -210,6 +218,56 @@ class WriteUnpreparedTxn : public WritePreparedTxn {
   // but in some cases, we should be able to restore the previously largest
   // value when calling RollbackToSavePoint.
   SequenceNumber largest_validated_seq_;
+
+  struct SavePoint {
+    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+    // used during RollbackToSavePoint to determine visibility when restoring
+    // old values.
+    //
+    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+    // subsets, this can potentially be deduplicated by just storing set
+    // difference. Investigate if this is worth it.
+    std::map<SequenceNumber, size_t> unprep_seqs_;
+
+    // This snapshot will be used to read keys at this savepoint if we call
+    // RollbackToSavePoint.
+    std::unique_ptr<ManagedSnapshot> snapshot_;
+
+    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+              ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot){};
+  };
+
+  // We have 3 data structures holding savepoint information:
+  // 1. TransactionBaseImpl::save_points_
+  // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparedTxn::unflushed_save_points_
+  //
+  // TransactionBaseImpl::save_points_ holds information about all write
+  // batches, including the current in-memory write_batch_, or unprepared
+  // batches that have been written out. Its responsibility is just to track
+  // which keys have been modified in every savepoint.
+  //
+  // WriteUnpreparedTxn::flushed_save_points_ holds information about
+  // savepoints set on unprepared batches that have already flushed. It holds
+  // the snapshot and unprep_seqs at that savepoint, so that the rollback
+  // process can determine which keys were visible at that point in time.
+  //
+  // WriteUnpreparedTxn::unflushed_save_points_ holds information about
+  // savepoints on the current in-memory write_batch_. It simply records the
+  // size of the write batch at every savepoint.
+  //
+  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+  // write_batch_.save_points_.
+  //
+  // Based on this information, here are some invariants:
+  // size(unflushed_save_points_) = size(write_batch_.save_points_)
+  // size(flushed_save_points_) + size(unflushed_save_points_)
+  //   = size(save_points_)
+  //
+  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+      flushed_save_points_;
+  std::unique_ptr<autovector<size_t>> unflushed_save_points_;
 };

 }  // namespace rocksdb
diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc
index 875d5416763..4381619e782 100644
--- a/utilities/transactions/write_unprepared_txn_db.cc
+++ b/utilities/transactions/write_unprepared_txn_db.cc
@@ -279,8 +279,8 @@ Status WriteUnpreparedTxnDB::Initialize(
     }
   }

-  wupt->write_batch_.Clear();
-  WriteBatchInternal::InsertNoop(wupt->write_batch_.GetWriteBatch());
+  const bool kClear = true;
+  wupt->InitWriteBatch(kClear);

   real_trx->SetState(Transaction::PREPARED);
   if (!s.ok()) {
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc
index cf17abf22e9..3ffa2e0c62a 100644
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -627,6 +627,11 @@ WriteBatchWithIndex::WriteBatchWithIndex(

 WriteBatchWithIndex::~WriteBatchWithIndex() {}

+WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default;
+
+WriteBatchWithIndex& WriteBatchWithIndex::operator=(WriteBatchWithIndex&&) =
+    default;
+
 WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }

 size_t WriteBatchWithIndex::SubBatchCnt() { return rep->sub_batch_cnt; }

From 1dfc5eaab03f998ab13a6953b53e41cdfd2c8237 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Wed, 31 Jul 2019 15:16:01 -0700
Subject: [PATCH 274/572] Test the various configurations in parallel in MergeOperatorPinningTest (#5659)

Summary: MergeOperatorPinningTest.Randomized frequently times out under TSAN because it tests ~40 option configurations sequentially in a loop. The patch parallelizes the tests of the various configurations to make the test complete faster.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5659

Test Plan: Tested using buck test mode/dev-tsan ...
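The mechanism, in outline: each option configuration becomes its own gtest parameter instance rather than one iteration of the old do/while loop, so the test harness can schedule the instances concurrently. A minimal, self-contained sketch of that pattern (illustrative names only, not the patch itself):

```cpp
#include <tuple>

#include <gtest/gtest.h>

// One test instance per (block-cache setting, option config) pair; a parallel
// test runner can shard these instead of executing ~40 configs in one body.
class PerConfigTest
    : public ::testing::TestWithParam<std::tuple<bool, int>> {};

TEST_P(PerConfigTest, Randomized) {
  bool disable_block_cache;
  int option_config;
  std::tie(disable_block_cache, option_config) = GetParam();
  // ... set up Options for this configuration and run the workload ...
  SUCCEED();
}

INSTANTIATE_TEST_CASE_P(
    AllConfigs, PerConfigTest,
    ::testing::Combine(::testing::Bool(), ::testing::Range(0, 40)));
```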
Differential Revision: D16587518 Pulled By: ltamasi fbshipit-source-id: 65bd25c0ad9a23587fed5592e69c1a0097fa27f6 --- db/db_merge_operator_test.cc | 142 ++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 2b5e4a445ea..31bd2e491b1 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -275,68 +275,6 @@ TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { VerifyDBFromMap(true_data); } -TEST_P(MergeOperatorPinningTest, Randomized) { - do { - Options options = CurrentOptions(); - options.merge_operator = MergeOperators::CreateMaxOperator(); - BlockBasedTableOptions table_options; - table_options.no_block_cache = disable_block_cache_; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - Random rnd(301); - std::map true_data; - - const int kTotalMerges = 5000; - // Every key gets ~10 operands - const int kKeyRange = kTotalMerges / 10; - const int kOperandSize = 20; - const int kNumPutBefore = kKeyRange / 10; // 10% value - const int kNumPutAfter = kKeyRange / 10; // 10% overwrite - const int kNumDelete = kKeyRange / 10; // 10% delete - - // kNumPutBefore keys will have base values - for (int i = 0; i < kNumPutBefore; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - } - - // Do kTotalMerges merges - for (int i = 0; i < kTotalMerges; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Merge(WriteOptions(), key, value)); - - if (true_data[key] < value) { - true_data[key] = value; - } - } - - // Overwrite random kNumPutAfter keys - for (int i = 0; i < kNumPutAfter; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - } - - // Delete random kNumDelete keys - for (int i = 0; i < kNumDelete; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - ASSERT_OK(db_->Delete(WriteOptions(), key)); - - true_data.erase(key); - } - - VerifyDBFromMap(true_data); - - } while (ChangeOptions(kSkipMergePut)); -} - class MergeOperatorHook : public MergeOperator { public: explicit MergeOperatorHook(std::shared_ptr _merge_op) @@ -637,6 +575,86 @@ TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { db_->ReleaseSnapshot(snapshot2); } +class PerConfigMergeOperatorPinningTest + : public DBMergeOperatorTest, + public testing::WithParamInterface> { + public: + PerConfigMergeOperatorPinningTest() { + std::tie(disable_block_cache_, option_config_) = GetParam(); + } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P( + MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest, + ::testing::Combine(::testing::Bool(), + ::testing::Range(static_cast(DBTestBase::kDefault), + static_cast(DBTestBase::kEnd)))); + +TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { + if (ShouldSkipOptions(option_config_, kSkipMergePut)) { + return; + } + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map 
true_data; + + const int kTotalMerges = 5000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); + + true_data.erase(key); + } + + VerifyDBFromMap(true_data); +} + } // namespace rocksdb int main(int argc, char** argv) { From d1c9ede1956a29472fbe7202cd3e8ee7aefa7c31 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Thu, 1 Aug 2019 15:45:19 -0700 Subject: [PATCH 275/572] Fix duplicated file names in PurgeObsoleteFiles (#5603) Summary: Currently in `DBImpl::PurgeObsoleteFiles`, the list of candidate files is create through a combination of calling LogFileName using `log_delete_files` and `full_scan_candidate_files`. In full_scan_candidate_files, the filenames look like this {file_name = "074715.log", file_path = "/txlogs/3306"}, but LogFileName produces filenames like this that prepends a slash: {file_name = "/074715.log", file_path = "/txlogs/3306"}, This confuses the dedup step here: https://github.com/facebook/rocksdb/blob/bb4178066dc4f18b9b7f1d371e641db027b3edbe/db/db_impl/db_impl_files.cc#L339-L345 Because duplicates still exist, DeleteFile is called on the same file twice, and hits an error on the second try. Error message: Failed to mark /txlogs/3302/764418.log as trash. The root cause is the use of `kDumbDbName` when generating file names, it creates file names like /074715.log. This PR removes the use of `kDumbDbName` and create paths without leading '/' when dbname can be ignored. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5603 Test Plan: make check Differential Revision: D16413203 Pulled By: miasantreble fbshipit-source-id: 6ba8288382c55f7d5e3892d722fc94b57d2e4491 --- db/db_impl/db_impl_files.cc | 5 ++--- file/filename.cc | 21 +++++++++++++++++---- file/filename.h | 4 ++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 7afe3955e5b..e3b2f576523 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -316,10 +316,9 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { candidate_files.size() + state.sst_delete_files.size() + state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
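// (Illustration of the mismatch described in the summary above: with the
// dummy dbname below, LogFileName(kDumbDbName, 74715) produced "/074715.log",
// while the full scan had recorded "074715.log"; the later sort-and-dedup over
// candidate_files therefore kept both entries and DeleteFile ran twice.)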
- const char* kDumbDbName = ""; for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( - MakeTableFileName(kDumbDbName, file.metadata->fd.GetNumber()), + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); @@ -329,7 +328,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), + candidate_files.emplace_back(LogFileName(file_num), immutable_db_options_.wal_dir); } } diff --git a/file/filename.cc b/file/filename.cc index d4f7dd9ec7c..65ec3314995 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -57,13 +57,17 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { return write_idx; } -static std::string MakeFileName(const std::string& name, uint64_t number, - const char* suffix) { +static std::string MakeFileName(uint64_t number, const char* suffix) { char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", + snprintf(buf, sizeof(buf), "%06llu.%s", static_cast(number), suffix); - return name + buf; + return buf; +} + +static std::string MakeFileName(const std::string& name, uint64_t number, + const char* suffix) { + return name + "/" + MakeFileName(number, suffix); } std::string LogFileName(const std::string& name, uint64_t number) { @@ -71,6 +75,11 @@ std::string LogFileName(const std::string& name, uint64_t number) { return MakeFileName(name, number, "log"); } +std::string LogFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, "log"); +} + std::string BlobFileName(const std::string& blobdirname, uint64_t number) { assert(number > 0); return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); @@ -95,6 +104,10 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) { return MakeFileName(path, number, kRocksDbTFileExt.c_str()); } +std::string MakeTableFileName(uint64_t number) { + return MakeFileName(number, kRocksDbTFileExt.c_str()); +} + std::string Rocks2LevelTableFileName(const std::string& fullname) { assert(fullname.size() > kRocksDbTFileExt.size() + 1); if (fullname.size() <= kRocksDbTFileExt.size() + 1) { diff --git a/file/filename.h b/file/filename.h index db06f4664e2..91b905f07ab 100644 --- a/file/filename.h +++ b/file/filename.h @@ -47,6 +47,8 @@ enum FileType { // "dbname". extern std::string LogFileName(const std::string& dbname, uint64_t number); +extern std::string LogFileName(uint64_t number); + extern std::string BlobFileName(const std::string& bdirname, uint64_t number); extern std::string BlobFileName(const std::string& dbname, @@ -63,6 +65,8 @@ extern std::string ArchivedLogFileName(const std::string& dbname, extern std::string MakeTableFileName(const std::string& name, uint64_t number); +extern std::string MakeTableFileName(uint64_t number); + // Return the name of sstable with LevelDB suffix // created from RocksDB sstable suffixed name extern std::string Rocks2LevelTableFileName(const std::string& fullname); From 30edf1874c11762a6cacf4434112ce34d13100d3 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 2 Aug 2019 10:40:32 -0700 Subject: [PATCH 276/572] Change buckifier to support parameterized dependencies (#5648) Summary: Users may desire to specify extra dependencies via buck. 
This PR allows users to pass additional dependencies as a JSON object so that the buckifier script can generate TARGETS file with desired extra dependencies. Test plan (on dev server) ``` $python buckifier/buckify_rocksdb.py '{"fake": {"extra_deps": [":test_dep", "//fakes/module:mock1"], "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"]}}' Generating TARGETS Extra dependencies: {'': {'extra_compiler_flags': [], 'extra_deps': []}, 'test_dep1': {'extra_compiler_flags': ['-O2', '-DROCKSDB_LITE'], 'extra_deps': [':fake', '//dep1/mock']}} Generated TARGETS Summary: - 5 libs - 0 binarys - 296 tests ``` Verify the TARGETS file. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5648 Differential Revision: D16565043 Pulled By: riversand963 fbshipit-source-id: a6ef02274174fcf159692d7b846e828454d01e89 --- TARGETS | 302 ++++++++++++++++++++++++++++++++++- buckifier/buckify_rocksdb.py | 97 ++++++++--- buckifier/targets_builder.py | 11 +- buckifier/targets_cfg.py | 8 +- defs.bzl | 8 +- 5 files changed, 398 insertions(+), 28 deletions(-) diff --git a/TARGETS b/TARGETS index 884d69b14bc..25d7ff66759 100644 --- a/TARGETS +++ b/TARGETS @@ -396,747 +396,1043 @@ cpp_library( external_deps = ROCKSDB_EXTERNAL_DEPS, ) -# [test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ [ "arena_test", "memory/arena_test.cc", "serial", + [], + [], ], [ "auto_roll_logger_test", "logging/auto_roll_logger_test.cc", "serial", + [], + [], ], [ "autovector_test", "util/autovector_test.cc", "serial", + [], + [], ], [ "backupable_db_test", "utilities/backupable/backupable_db_test.cc", "parallel", + [], + [], ], [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", "serial", + [], + [], ], [ "block_based_filter_block_test", "table/block_based/block_based_filter_block_test.cc", "serial", + [], + [], ], [ "block_cache_trace_analyzer_test", "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", "serial", + [], + [], ], [ "block_cache_tracer_test", "trace_replay/block_cache_tracer_test.cc", "serial", + [], + [], ], [ "block_test", "table/block_based/block_test.cc", "serial", + [], + [], ], [ "bloom_test", "util/bloom_test.cc", "serial", + [], + [], ], [ "c_test", "db/c_test.c", "serial", + [], + [], ], [ "cache_simulator_test", "utilities/simulator_cache/cache_simulator_test.cc", "serial", + [], + [], ], [ "cache_test", "cache/cache_test.cc", "serial", + [], + [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", "serial", + [], + [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", "serial", + [], + [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", "serial", + [], + [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", "serial", + [], + [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", "serial", + [], + [], ], [ "cleanable_test", "table/cleanable_test.cc", "serial", + [], + [], ], [ "coding_test", "util/coding_test.cc", "serial", + [], + [], ], [ "column_family_test", "db/column_family_test.cc", "serial", + [], + [], ], [ "compact_files_test", "db/compact_files_test.cc", "serial", + [], + [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", "serial", + [], + [], ], [ "compaction_iterator_test", "db/compaction/compaction_iterator_test.cc", "serial", + [], + [], ], [ "compaction_job_stats_test", 
"db/compaction/compaction_job_stats_test.cc", "serial", + [], + [], ], [ "compaction_job_test", "db/compaction/compaction_job_test.cc", "serial", + [], + [], ], [ "compaction_picker_test", "db/compaction/compaction_picker_test.cc", "serial", + [], + [], ], [ "comparator_db_test", "db/comparator_db_test.cc", "serial", + [], + [], ], [ "corruption_test", "db/corruption_test.cc", "serial", + [], + [], ], [ "crc32c_test", "util/crc32c_test.cc", "serial", + [], + [], ], [ "cuckoo_table_builder_test", "table/cuckoo/cuckoo_table_builder_test.cc", "serial", + [], + [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", "serial", + [], + [], ], [ "cuckoo_table_reader_test", "table/cuckoo/cuckoo_table_reader_test.cc", "serial", + [], + [], ], [ "data_block_hash_index_test", "table/block_based/data_block_hash_index_test.cc", "serial", + [], + [], ], [ "db_basic_test", "db/db_basic_test.cc", "serial", + [], + [], ], [ "db_blob_index_test", "db/db_blob_index_test.cc", "serial", + [], + [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", "serial", + [], + [], ], [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", "parallel", + [], + [], ], [ "db_compaction_filter_test", "db/db_compaction_filter_test.cc", "parallel", + [], + [], ], [ "db_compaction_test", "db/db_compaction_test.cc", "parallel", + [], + [], ], [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", "serial", + [], + [], ], [ "db_encryption_test", "db/db_encryption_test.cc", "serial", + [], + [], ], [ "db_flush_test", "db/db_flush_test.cc", "serial", + [], + [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", "serial", + [], + [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", "serial", + [], + [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", "serial", + [], + [], ], [ "db_iter_test", "db/db_iter_test.cc", "serial", + [], + [], ], [ "db_iterator_test", "db/db_iterator_test.cc", "serial", + [], + [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", "serial", + [], + [], ], [ "db_memtable_test", "db/db_memtable_test.cc", "serial", + [], + [], ], [ "db_merge_operator_test", "db/db_merge_operator_test.cc", "parallel", + [], + [], ], [ "db_options_test", "db/db_options_test.cc", "serial", + [], + [], ], [ "db_properties_test", "db/db_properties_test.cc", "serial", + [], + [], ], [ "db_range_del_test", "db/db_range_del_test.cc", "serial", + [], + [], ], [ "db_secondary_test", "db/db_impl/db_secondary_test.cc", "serial", + [], + [], ], [ "db_sst_test", "db/db_sst_test.cc", "parallel", + [], + [], ], [ "db_statistics_test", "db/db_statistics_test.cc", "serial", + [], + [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", "serial", + [], + [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", "serial", + [], + [], ], [ "db_test", "db/db_test.cc", "parallel", + [], + [], ], [ "db_test2", "db/db_test2.cc", "serial", + [], + [], ], [ "db_universal_compaction_test", "db/db_universal_compaction_test.cc", "parallel", + [], + [], ], [ "db_wal_test", "db/db_wal_test.cc", "parallel", + [], + [], ], [ "db_write_test", "db/db_write_test.cc", "serial", + [], + [], ], [ "dbformat_test", "db/dbformat_test.cc", "serial", + [], + [], ], [ "delete_scheduler_test", "file/delete_scheduler_test.cc", "serial", + [], + [], ], [ "deletefile_test", "db/deletefile_test.cc", "serial", + [], + [], ], [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", "serial", + [], + [], ], [ "env_basic_test", "env/env_basic_test.cc", "serial", + [], + [], ], [ "env_logger_test", 
"logging/env_logger_test.cc", "serial", + [], + [], ], [ "env_test", "env/env_test.cc", "serial", + [], + [], ], [ "env_timed_test", "utilities/env_timed_test.cc", "serial", + [], + [], ], [ "error_handler_test", "db/error_handler_test.cc", "serial", + [], + [], ], [ "event_logger_test", "logging/event_logger_test.cc", "serial", + [], + [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", "serial", + [], + [], ], [ "external_sst_file_test", "db/external_sst_file_test.cc", "parallel", + [], + [], ], [ "fault_injection_test", "db/fault_injection_test.cc", "parallel", + [], + [], ], [ "file_indexer_test", "db/file_indexer_test.cc", "serial", + [], + [], ], [ "file_reader_writer_test", "util/file_reader_writer_test.cc", "parallel", + [], + [], ], [ "filelock_test", "util/filelock_test.cc", "serial", + [], + [], ], [ "filename_test", "db/filename_test.cc", "serial", + [], + [], ], [ "flush_job_test", "db/flush_job_test.cc", "serial", + [], + [], ], [ "full_filter_block_test", "table/block_based/full_filter_block_test.cc", "serial", + [], + [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", "serial", + [], + [], ], [ "hash_test", "util/hash_test.cc", "serial", + [], + [], ], [ "heap_test", "util/heap_test.cc", "serial", + [], + [], ], [ "histogram_test", "monitoring/histogram_test.cc", "serial", + [], + [], ], [ "import_column_family_test", "db/import_column_family_test.cc", "parallel", + [], + [], ], [ "inlineskiplist_test", "memtable/inlineskiplist_test.cc", "parallel", + [], + [], ], [ "iostats_context_test", "monitoring/iostats_context_test.cc", "serial", + [], + [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", "serial", + [], + [], ], [ "listener_test", "db/listener_test.cc", "serial", + [], + [], ], [ "log_test", "db/log_test.cc", "serial", + [], + [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", "serial", + [], + [], ], [ "manual_compaction_test", "db/manual_compaction_test.cc", "parallel", + [], + [], ], [ "memory_test", "utilities/memory/memory_test.cc", "serial", + [], + [], ], [ "memtable_list_test", "db/memtable_list_test.cc", "serial", + [], + [], ], [ "merge_helper_test", "db/merge_helper_test.cc", "serial", + [], + [], ], [ "merge_test", "db/merge_test.cc", "serial", + [], + [], ], [ "merger_test", "table/merger_test.cc", "serial", + [], + [], ], [ "mock_env_test", "env/mock_env_test.cc", "serial", + [], + [], ], [ "object_registry_test", "utilities/object_registry_test.cc", "serial", + [], + [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", "serial", + [], + [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", "serial", + [], + [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", "serial", + [], + [], ], [ "options_file_test", "db/options_file_test.cc", "serial", + [], + [], ], [ "options_settable_test", "options/options_settable_test.cc", "serial", + [], + [], ], [ "options_test", "options/options_test.cc", "serial", + [], + [], ], [ "options_util_test", "utilities/options/options_util_test.cc", "serial", + [], + [], ], [ "partitioned_filter_block_test", "table/block_based/partitioned_filter_block_test.cc", "serial", + [], + [], ], [ "perf_context_test", "db/perf_context_test.cc", "serial", + [], + [], ], [ "persistent_cache_test", "utilities/persistent_cache/persistent_cache_test.cc", "parallel", + [], + [], ], [ "plain_table_db_test", "db/plain_table_db_test.cc", "serial", + [], + [], ], [ "prefix_test", 
"db/prefix_test.cc", "serial", + [], + [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", "serial", + [], + [], ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", "serial", + [], + [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", "serial", + [], + [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", "serial", + [], + [], ], [ "repair_test", "db/repair_test.cc", "serial", + [], + [], ], [ "repeatable_thread_test", "util/repeatable_thread_test.cc", "serial", + [], + [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", "serial", + [], + [], ], [ "skiplist_test", "memtable/skiplist_test.cc", "serial", + [], + [], ], [ "slice_transform_test", "util/slice_transform_test.cc", "serial", + [], + [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", "serial", + [], + [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", "serial", + [], + [], ], [ "statistics_test", "monitoring/statistics_test.cc", "serial", + [], + [], ], [ "stats_history_test", "monitoring/stats_history_test.cc", "serial", + [], + [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", "serial", + [], + [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", "serial", + [], + [], ], [ "table_test", "table/table_test.cc", "parallel", + [], + [], ], [ "thread_list_test", "util/thread_list_test.cc", "serial", + [], + [], ], [ "thread_local_test", "util/thread_local_test.cc", "serial", + [], + [], ], [ "timer_queue_test", "util/timer_queue_test.cc", "serial", + [], + [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", "serial", + [], + [], ], [ "transaction_test", "utilities/transactions/transaction_test.cc", "parallel", + [], + [], ], [ "ttl_test", "utilities/ttl/ttl_test.cc", "serial", + [], + [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", "serial", + [], + [], ], [ "version_builder_test", "db/version_builder_test.cc", "serial", + [], + [], ], [ "version_edit_test", "db/version_edit_test.cc", "serial", + [], + [], ], [ "version_set_test", "db/version_set_test.cc", "serial", + [], + [], ], [ "wal_manager_test", "db/wal_manager_test.cc", "serial", + [], + [], ], [ "write_batch_test", "db/write_batch_test.cc", "serial", + [], + [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", "serial", + [], + [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", "serial", + [], + [], ], [ "write_callback_test", "db/write_callback_test.cc", "serial", + [], + [], ], [ "write_controller_test", "db/write_controller_test.cc", "serial", + [], + [], ], [ "write_prepared_transaction_test", "utilities/transactions/write_prepared_transaction_test.cc", "parallel", + [], + [], ], [ "write_unprepared_transaction_test", "utilities/transactions/write_unprepared_transaction_test.cc", "parallel", + [], + [], ], ] @@ -1145,6 +1441,8 @@ ROCKS_TESTS = [ # will not be included. 
[ test_binary( + extra_compiler_flags = extra_compiler_flags, + extra_deps = extra_deps, parallelism = parallelism, rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, @@ -1155,6 +1453,6 @@ ROCKS_TESTS = [ test_cc = test_cc, test_name = test_name, ) - for test_name, test_cc, parallelism in ROCKS_TESTS + for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode ] diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index 94b63a4e8bf..fc59cf5830a 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -4,12 +4,31 @@ from __future__ import print_function from __future__ import unicode_literals from targets_builder import TARGETSBuilder +import json import os import fnmatch import sys from util import ColorString +# This script generates TARGETS file for Buck. +# Buck is a build tool specifying dependencies among different build targets. +# User can pass extra dependencies as a JSON object via command line, and this +# script can include these dependencies in the generate TARGETS file. +# Usage: +# $python buckifier/buckify_rocksdb.py +# (This generates a TARGET file without user-specified dependency for unit +# tests.) +# $python buckifier/buckify_rocksdb.py \ +# '{"fake": { \ +# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ +# } \ +# }' +# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB +# unit tests, and will use the extra_compiler_flags to compile the unit test +# source.) + # tests to export as libraries for inclusion in other projects _EXPORTED_TEST_LIBS = ["env_basic_test"] @@ -86,8 +105,38 @@ def get_tests(repo_path): return tests +# Parse extra dependencies passed by user from command line +def get_dependencies(): + deps_map = { + ''.encode('ascii'): { + 'extra_deps'.encode('ascii'): [], + 'extra_compiler_flags'.encode('ascii'): [] + } + } + if len(sys.argv) < 2: + return deps_map + + def encode_dict(data): + rv = {} + for k, v in data.items(): + if isinstance(k, unicode): + k = k.encode('ascii') + if isinstance(v, unicode): + v = v.encode('ascii') + elif isinstance(v, list): + v = [x.encode('ascii') for x in v] + elif isinstance(v, dict): + v = encode_dict(v) + rv[k] = v + return rv + extra_deps = json.loads(sys.argv[1], object_hook=encode_dict) + for target_alias, deps in extra_deps.items(): + deps_map[target_alias] = deps + return deps_map + + # Prepare TARGETS file for buck -def generate_targets(repo_path): +def generate_targets(repo_path, deps_map): print(ColorString.info("Generating TARGETS")) # parsed src.mk file src_mk = parse_src_mk(repo_path) @@ -121,24 +170,33 @@ def generate_targets(repo_path): ["test_util/testutil.cc"], [":rocksdb_lib"]) + print("Extra dependencies:\n{0}".format(str(deps_map))) # test for every test we found in the Makefile - for test in sorted(tests): - match_src = [src for src in cc_files if ("/%s.c" % test) in src] - if len(match_src) == 0: - print(ColorString.warning("Cannot find .cc file for %s" % test)) - continue - elif len(match_src) > 1: - print(ColorString.warning("Found more than one .cc for %s" % test)) - print(match_src) - continue - - assert(len(match_src) == 1) - is_parallel = tests[test] - TARGETS.register_test(test, match_src[0], is_parallel) - - if test in _EXPORTED_TEST_LIBS: - test_library = "%s_lib" % test - TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) + for 
target_alias, deps in deps_map.items(): + for test in sorted(tests): + match_src = [src for src in cc_files if ("/%s.c" % test) in src] + if len(match_src) == 0: + print(ColorString.warning("Cannot find .cc file for %s" % test)) + continue + elif len(match_src) > 1: + print(ColorString.warning("Found more than one .cc for %s" % test)) + print(match_src) + continue + + assert(len(match_src) == 1) + is_parallel = tests[test] + test_target_name = \ + test if not target_alias else test + "_" + target_alias + TARGETS.register_test( + test_target_name, + match_src[0], + is_parallel, + deps['extra_deps'], + deps['extra_compiler_flags']) + + if test in _EXPORTED_TEST_LIBS: + test_library = "%s_lib" % test_target_name + TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) TARGETS.flush_tests() print(ColorString.info("Generated TARGETS Summary:")) @@ -163,8 +221,9 @@ def exit_with_error(msg): def main(): + deps_map = get_dependencies() # Generate TARGETS file for buck - ok = generate_targets(get_rocksdb_path()) + ok = generate_targets(get_rocksdb_path(), deps_map) if not ok: exit_with_error("Failed to generate TARGETS files") diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index 493cd8a8a8a..78db6a169b3 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -51,14 +51,21 @@ def add_binary(self, name, srcs, deps=None): pretty_list(deps))) self.total_bin = self.total_bin + 1 - def register_test(self, test_name, src, is_parallel): + def register_test(self, + test_name, + src, + is_parallel, + extra_deps, + extra_compiler_flags): exec_mode = "serial" if is_parallel: exec_mode = "parallel" self.tests_cfg += targets_cfg.test_cfg_template % ( test_name, str(src), - str(exec_mode)) + str(exec_mode), + extra_deps, + extra_compiler_flags) self.total_test = self.total_test + 1 diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 0ebd6d9427e..19ea777270d 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -140,11 +140,13 @@ "%s", "%s", "%s", + %s, + %s, ], """ unittests_template = """ -# [test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ %s] @@ -153,6 +155,8 @@ # will not be included. 
[
    test_binary(
+       extra_compiler_flags = extra_compiler_flags,
+       extra_deps = extra_deps,
        parallelism = parallelism,
        rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS,
        rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS,
@@ -163,7 +167,7 @@
        test_cc = test_cc,
        test_name = test_name,
    )
-   for test_name, test_cc, parallelism in ROCKS_TESTS
+   for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS
    if not is_opt_mode
]
"""
diff --git a/defs.bzl b/defs.bzl
index a9f25ebcc42..d5b7b6af718 100644
--- a/defs.bzl
+++ b/defs.bzl
@@ -12,7 +12,9 @@ def test_binary(
        rocksdb_compiler_flags,
        rocksdb_preprocessor_flags,
        rocksdb_external_deps,
-       rocksdb_os_deps):
+       rocksdb_os_deps,
+       extra_deps,
+       extra_compiler_flags):
    TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh"
    ttype = "gtest" if parallelism == "parallel" else "simple"
@@ -23,9 +25,9 @@ def test_binary(
        srcs = [test_cc],
        arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
        os_preprocessor_flags = rocksdb_os_preprocessor_flags,
-       compiler_flags = rocksdb_compiler_flags,
+       compiler_flags = rocksdb_compiler_flags + extra_compiler_flags,
        preprocessor_flags = rocksdb_preprocessor_flags,
-       deps = [":rocksdb_test_lib"],
+       deps = [":rocksdb_test_lib"] + extra_deps,
        os_deps = rocksdb_os_deps,
        external_deps = rocksdb_external_deps,
    )

From e579e32eaa33ba368c7b1d4de61da6ae4c7b1351 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 5 Aug 2019 13:30:31 -0700
Subject: [PATCH 277/572] Disable ReadYourOwnWriteStress when run under Valgrind (#5671)

Summary: The test sometimes times out when run under Valgrind, taking around 20 minutes. The patch skips the test under Valgrind.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5671

Differential Revision: D16652382

Pulled By: maysamyabandeh

fbshipit-source-id: 0f6f4f76d37337d56226b689e01b14523dd07aae
---
 utilities/transactions/write_unprepared_transaction_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc
index feaedea067f..e9d305c69e9 100644
--- a/utilities/transactions/write_unprepared_transaction_test.cc
+++ b/utilities/transactions/write_unprepared_transaction_test.cc
@@ -115,6 +115,7 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
   }
 }

+#ifndef ROCKSDB_VALGRIND_RUN
 TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) {
   // This is a stress test where different threads are writing random keys, and
   // then before committing or aborting the transaction, it validates to see
@@ -294,6 +295,7 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWriteStress) {
     }
   }
 }
+#endif  // ROCKSDB_VALGRIND_RUN

 // This tests how write unprepared behaves during recovery when the DB crashes
 // after a transaction has either been unprepared or prepared, and tests if

From 208556ee13306050f20cfddb4eac6cdcc2b1c850 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Mon, 5 Aug 2019 13:30:56 -0700
Subject: [PATCH 278/572] WritePrepared: fix Get without snapshot (#5664)

Summary: If read_options.snapshot is not set, ::Get takes the last sequence number after acquiring a super-version and uses that as the snapshot sequence number. Theoretically max_evicted_seq_ could advance past this sequence number. This could cause ::IsInSnapshot, which is invoked by the ReadCallback, to notice the absence of the snapshot.
In this case, the ReadCallback should have passed a non-value to snap_released so that it could be set by the ::IsInSnapshot. The patch does that, and adds a unit test to verify it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5664 Differential Revision: D16614033 Pulled By: maysamyabandeh fbshipit-source-id: 06fb3fd4aacd75806ed1a1acec7961f5d02486f2 --- include/rocksdb/statistics.h | 2 + java/rocksjni/portal.h | 4 ++ .../src/main/java/org/rocksdb/TickerType.java | 5 ++ monitoring/statistics.cc | 1 + .../write_prepared_transaction_test.cc | 60 +++++++++++++++- utilities/transactions/write_prepared_txn.cc | 36 ++++++---- .../transactions/write_prepared_txn_db.cc | 14 ++-- .../transactions/write_prepared_txn_db.h | 68 ++++++++++++++----- .../transactions/write_unprepared_txn.cc | 32 ++++++--- utilities/transactions/write_unprepared_txn.h | 23 ++++++- .../transactions/write_unprepared_txn_db.cc | 3 +- 11 files changed, 199 insertions(+), 49 deletions(-) diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index a8d01e03415..b6b78ef99a3 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -324,6 +324,8 @@ enum Tickers : uint32_t { TXN_DUPLICATE_KEY_OVERHEAD, // # of times snapshot_mutex_ is acquired in the fast path. TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, // Number of keys actually found in MultiGet calls (vs number requested by // caller) diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 667af809bdc..e9dc3fb82b1 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -4620,6 +4620,8 @@ class TickerTypeJni { return -0x0B; case rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: return -0x0C; + case rocksdb::Tickers::TXN_GET_TRY_AGAIN: + return -0x0D; case rocksdb::Tickers::TICKER_ENUM_MAX: // 0x5F for backwards compatibility on current minor version. return 0x5F; @@ -4912,6 +4914,8 @@ class TickerTypeJni { return rocksdb::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; case -0x0C: return rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; + case -0x0D: + return rocksdb::Tickers::TXN_GET_TRY_AGAIN; case 0x5F: // 0x5F for backwards compatibility on current minor version. 
return rocksdb::Tickers::TICKER_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 551e366dc53..40a642bd666 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -717,6 +717,11 @@ public enum TickerType { */ TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C), + /** + * # of times ::Get returned TryAgain due to expired snapshot seq + */ + TXN_GET_TRY_AGAIN((byte) -0x0D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 15d702d1f4a..70c993b201a 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -162,6 +162,7 @@ const std::vector> TickersNameMap = { "rocksdb.txn.overhead.mutex.old.commit.map"}, {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"}, {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index ef89aaeb8c7..2cb91f0d350 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1372,7 +1372,7 @@ TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) { for (int i = 0; i < writes; i++) { WriteBatch batch; // For duplicate keys cause 4 commit entries, each evicting an entry that - // is not published yet, thus causing max ecited seq go higher than last + // is not published yet, thus causing max evicted seq go higher than last // published. 
for (int b = 0; b < batch_cnt; b++) { batch.Put("foo", "foo"); @@ -1404,6 +1404,64 @@ TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) { db->ReleaseSnapshot(snap); } +// Test that reads without snapshots would not hit an undefined state +TEST_P(WritePreparedTransactionTest, MaxCatchupWithUnbackedSnapshot) { + const size_t snapshot_cache_bits = 7; // same as default + const size_t commit_cache_bits = 0; // only 1 entry => frequent eviction + UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + ReOpen(); + WriteOptions woptions; + WritePreparedTxnDB* wp_db = dynamic_cast(db); + + const int writes = 50; + rocksdb::port::Thread t1([&]() { + for (int i = 0; i < writes; i++) { + WriteBatch batch; + batch.Put("key", "foo"); + db->Write(woptions, &batch); + } + }); + + rocksdb::port::Thread t2([&]() { + while (wp_db->max_evicted_seq_ == 0) { // wait for insert thread + std::this_thread::yield(); + } + ReadOptions ropt; + PinnableSlice pinnable_val; + TransactionOptions txn_options; + for (int i = 0; i < 10; i++) { + auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + pinnable_val.Reset(); + Transaction* txn = db->BeginTransaction(woptions, txn_options); + s = txn->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + pinnable_val.Reset(); + std::vector values; + auto s_vec = + txn->MultiGet(ropt, {db->DefaultColumnFamily()}, {"key"}, &values); + ASSERT_EQ(1, values.size()); + ASSERT_EQ(1, s_vec.size()); + s = s_vec[0]; + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + Slice key("key"); + txn->MultiGet(ropt, db->DefaultColumnFamily(), 1, &key, &pinnable_val, + &s, true); + ASSERT_TRUE(s.ok() || s.IsTryAgain()); + delete txn; + } + }); + + t1.join(); + t2.join(); + + // Make sure that the test has worked and seq number has advanced as we + // thought + auto snap = db->GetSnapshot(); + ASSERT_GT(snap->GetSequenceNumber(), writes - 1); + db->ReleaseSnapshot(snap); +} + // Check that old_commit_map_ cleanup works correctly if the snapshot equals // max_evicted_seq_. 
TEST_P(WritePreparedTransactionTest, CleanupSnapshotEqualToMax) { diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 97bebac5d57..188f61120be 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -46,13 +46,16 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, PinnableSlice* values, Status* statuses, bool sorted_input) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); - WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, keys, values, statuses, sorted_input, &callback); - if (UNLIKELY(!wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + if (UNLIKELY(!callback.valid() || + !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); for (size_t i = 0; i < num_keys; i++) { statuses[i] = Status::TryAgain(); } @@ -63,15 +66,18 @@ Status WritePreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); - WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted); + WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, + backed_by_snapshot); auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, pinnable_val, &callback); - if (LIKELY(wpt_db_->ValidateSnapshot(callback.max_visible_seq(), + if (LIKELY(callback.valid() && + wpt_db_->ValidateSnapshot(callback.max_visible_seq(), backed_by_snapshot))) { return res; } else { + wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); return Status::TryAgain(); } } @@ -241,9 +247,11 @@ Status WritePreparedTxn::RollbackInternal() { auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap(); auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap(); auto read_at_seq = kMaxSequenceNumber; + ReadOptions roptions; + // to prevent callback's seq to be overrriden inside DBImpk::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); struct RollbackWriteBatchBuilder : public WriteBatch::Handler { DBImpl* db_; - ReadOptions roptions; WritePreparedTxnReadCallback callback; WriteBatch* rollback_batch_; std::map& comparators_; @@ -251,18 +259,20 @@ Status WritePreparedTxn::RollbackInternal() { using CFKeys = std::set; std::map keys_; bool rollback_merge_operands_; + ReadOptions roptions_; RollbackWriteBatchBuilder( DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq, WriteBatch* dst_batch, std::map& comparators, std::map& handles, - bool rollback_merge_operands) + bool rollback_merge_operands, ReadOptions _roptions) : db_(db), callback(wpt_db, snap_seq), // disable min_uncommitted optimization rollback_batch_(dst_batch), comparators_(comparators), handles_(handles), - rollback_merge_operands_(rollback_merge_operands) {} + rollback_merge_operands_(rollback_merge_operands), + roptions_(_roptions) {} Status Rollback(uint32_t cf, const Slice& key) { Status s; @@ -280,7 +290,7 @@ Status WritePreparedTxn::RollbackInternal() { PinnableSlice pinnable_val; 
      bool not_used;
      auto cf_handle = handles_[cf];
-      s = db_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used,
+      s = db_->GetImpl(roptions_, cf_handle, key, &pinnable_val, &not_used,
                        &callback);
      assert(s.ok() || s.IsNotFound());
      if (s.ok()) {
@@ -330,7 +340,8 @@ Status WritePreparedTxn::RollbackInternal() {
    bool WriteAfterCommit() const override { return false; }
  } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
                     *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
-                    wpt_db_->txn_db_options_.rollback_merge_operands);
+                    wpt_db_->txn_db_options_.rollback_merge_operands,
+                    roptions);
  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
  assert(s.ok());
  if (!s.ok()) {
@@ -434,7 +445,8 @@ Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
  ColumnFamilyHandle* cfh =
      column_family ? column_family : db_impl_->DefaultColumnFamily();

-  WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted);
+  WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted,
+                                            kBackedByDBSnapshot);
  return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(),
                                               snap_seq, false /* cache_only */,
                                               &snap_checker, min_uncommitted);
diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc
index a3b523a22cf..e6d71020685 100644
--- a/utilities/transactions/write_prepared_txn_db.cc
+++ b/utilities/transactions/write_prepared_txn_db.cc
@@ -226,16 +226,18 @@ Status WritePreparedTxnDB::Get(const ReadOptions& options,
                                ColumnFamilyHandle* column_family,
                                const Slice& key, PinnableSlice* value) {
  SequenceNumber min_uncommitted, snap_seq;
-  const bool backed_by_snapshot =
+  const SnapshotBackup backed_by_snapshot =
      AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
-  WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted);
+  WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
  bool* dont_care = nullptr;
  auto res = db_impl_->GetImpl(options, column_family, key, value, dont_care,
                               &callback);
-  if (LIKELY(
-          ValidateSnapshot(callback.max_visible_seq(), backed_by_snapshot))) {
+  if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(),
+                                                  backed_by_snapshot))) {
    return res;
  } else {
+    WPRecordTick(TXN_GET_TRY_AGAIN);
    return Status::TryAgain();
  }
}
@@ -298,7 +300,8 @@ struct WritePreparedTxnDB::IteratorState {
  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
                std::shared_ptr<ManagedSnapshot> s,
                SequenceNumber min_uncommitted)
-      : callback(txn_db, sequence, min_uncommitted), snapshot(s) {}
+      : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot),
+        snapshot(s) {}

  WritePreparedTxnReadCallback callback;
  std::shared_ptr<ManagedSnapshot> snapshot;
@@ -392,6 +395,7 @@ void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) {
      new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {});
  commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>(
      new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
+  dummy_max_snapshot_.number_ = kMaxSequenceNumber;
}

void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h
index 9561bfada17..4ee7d8e6cf8 100644
--- a/utilities/transactions/write_prepared_txn_db.h
+++ b/utilities/transactions/write_prepared_txn_db.h
@@ -30,6 +30,7 @@
#include "utilities/transactions/write_prepared_txn.h"

namespace rocksdb {
+enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot };

// A
PessimisticTransactionDB that writes data to DB after prepare phase of 2PC. // In this way some data in the DB might not be committed. The DB provides @@ -448,18 +449,21 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const ColumnFamilyOptions& cf_options) override; // Assign the min and max sequence numbers for reading from the db. A seq > // max is not valid, and a seq < min is valid, and a min <= seq < max requires - // further checkings. Normally max is defined by the snapshot and min is by + // further checking. Normally max is defined by the snapshot and min is by // minimum uncommitted seq. - inline bool AssignMinMaxSeqs(const Snapshot* snapshot, SequenceNumber* min, - SequenceNumber* max); + inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot, + SequenceNumber* min, + SequenceNumber* max); // Validate is a snapshot sequence number is still valid based on the latest // db status. backed_by_snapshot specifies if the number is baked by an actual // snapshot object. order specified the memory order with which we load the // atomic variables: relax is enough for the default since we care about last // value seen by same thread. inline bool ValidateSnapshot( - const SequenceNumber snap_seq, const bool backed_by_snapshot, + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, std::memory_order order = std::memory_order_relaxed); + // Get a dummy snapshot that refers to kMaxSequenceNumber + Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; } private: friend class AddPreparedCallback; @@ -488,6 +492,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { friend class WritePreparedTransactionTest_IsInSnapshotTest_Test; friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test; friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test; + friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test; friend class WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test; friend class @@ -783,26 +788,55 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // Thread safety: since the handle is read-only object it is a const it is // safe to read it concurrently std::shared_ptr> handle_map_; + // A dummy snapshot object that refers to kMaxSequenceNumber + SnapshotImpl dummy_max_snapshot_; }; class WritePreparedTxnReadCallback : public ReadCallback { public: WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot) - : ReadCallback(snapshot), db_(db) {} + : ReadCallback(snapshot), + db_(db), + backed_by_snapshot_(kBackedByDBSnapshot) {} WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot, - SequenceNumber min_uncommitted) - : ReadCallback(snapshot, min_uncommitted), db_(db) {} + SequenceNumber min_uncommitted, + SnapshotBackup backed_by_snapshot) + : ReadCallback(snapshot, min_uncommitted), + db_(db), + backed_by_snapshot_(backed_by_snapshot) { + (void)backed_by_snapshot_; // to silence unused private field warning + } + + virtual ~WritePreparedTxnReadCallback() { + // If it is not backed by snapshot, the caller must check validity + assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); + } // Will be called to see if the seq number visible; if not it moves on to // the next seq number. 
inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override { auto snapshot = max_visible_seq_; - return db_->IsInSnapshot(seq, snapshot, min_uncommitted_); + bool snap_released = false; + auto ret = + db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; + } + + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; } // TODO(myabandeh): override Refresh when Iterator::Refresh is supported private: WritePreparedTxnDB* db_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; }; class AddPreparedCallback : public PreReleaseCallback { @@ -1034,26 +1068,26 @@ struct SubBatchCounter : public WriteBatch::Handler { bool WriteAfterCommit() const override { return false; } }; -bool WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot, - SequenceNumber* min, - SequenceNumber* max) { +SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot, + SequenceNumber* min, + SequenceNumber* max) { if (snapshot != nullptr) { *min = static_cast_with_check(snapshot) ->min_uncommitted_; *max = static_cast_with_check(snapshot) ->number_; - return true; + return kBackedByDBSnapshot; } else { *min = SmallestUnCommittedSeq(); *max = 0; // to be assigned later after sv is referenced. - return false; + return kUnbackedByDBSnapshot; } } -bool WritePreparedTxnDB::ValidateSnapshot(const SequenceNumber snap_seq, - const bool backed_by_snapshot, - std::memory_order order) { - if (backed_by_snapshot) { +bool WritePreparedTxnDB::ValidateSnapshot( + const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot, + std::memory_order order) { + if (backed_by_snapshot == kBackedByDBSnapshot) { return true; } else { SequenceNumber max = max_evicted_seq_.load(order); diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 993c3b8b60c..a1862d32d44 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -25,7 +25,11 @@ bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) { } } - return db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_); + bool snap_released = false; + auto ret = db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_); + assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot); + snap_released_ |= snap_released; + return ret; } WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db, @@ -547,8 +551,9 @@ Status WriteUnpreparedTxn::RollbackInternal() { Status s; const auto& cf_map = *wupt_db_->GetCFHandleMap(); auto read_at_seq = kMaxSequenceNumber; - ReadOptions roptions; + // to prevent callback's seq to be overrriden inside DBImpk::Get + roptions.snapshot = wpt_db_->GetMaxSnapshot(); // Note that we do not use WriteUnpreparedTxnReadCallback because we do not // need to read our own writes when reading prior versions of the key for // rollback. 
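// (Sketch of the reasoning behind the GetMaxSnapshot() call above, pieced
// together from this patch: DBImpl::Get refreshes the read callback's
// sequence to the current last sequence unless read_options.snapshot is set,
// so pinning the read to dummy_max_snapshot_, whose number_ is
// kMaxSequenceNumber, keeps every version visible to the callback and leaves
// visibility decisions entirely to the callback during rollback.)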
@@ -704,7 +709,8 @@ Status WriteUnpreparedTxn::RollbackToSavePointInternal() { ->min_uncommitted_; SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber(); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - top.unprep_seqs_); + top.unprep_seqs_, + kBackedByDBSnapshot); const auto& cf_map = *wupt_db_->GetCFHandleMap(); for (const auto& cfkey : tracked_keys) { const auto cfid = cfkey.first; @@ -784,14 +790,16 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, PinnableSlice* values, Status* statuses, bool sorted_input) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - unprep_seqs_); + unprep_seqs_, backed_by_snapshot); write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, keys, values, statuses, sorted_input, &callback); - if (UNLIKELY(!wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + if (UNLIKELY(!callback.valid() || + !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); for (size_t i = 0; i < num_keys; i++) { statuses[i] = Status::TryAgain(); } @@ -802,15 +810,17 @@ Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { SequenceNumber min_uncommitted, snap_seq; - const bool backed_by_snapshot = + const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, - unprep_seqs_); + unprep_seqs_, backed_by_snapshot); auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key, value, &callback); - if (LIKELY(wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { + if (LIKELY(callback.valid() && + wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { return res; } else { + wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); return Status::TryAgain(); } } @@ -854,8 +864,8 @@ Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family, ColumnFamilyHandle* cfh = column_family ? column_family : db_impl_->DefaultColumnFamily(); - WriteUnpreparedTxnReadCallback snap_checker(wupt_db_, snap_seq, - min_uncommitted, unprep_seqs_); + WriteUnpreparedTxnReadCallback snap_checker( + wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot); return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(), snap_seq, false /* cache_only */, &snap_checker, min_uncommitted); diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 774d90e8d37..5c654b05ba8 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -56,7 +56,8 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { WriteUnpreparedTxnReadCallback( WritePreparedTxnDB* db, SequenceNumber snapshot, SequenceNumber min_uncommitted, - const std::map<SequenceNumber, size_t>& unprep_seqs) + const std::map<SequenceNumber, size_t>& unprep_seqs, + SnapshotBackup backed_by_snapshot) // Pass our last uncommitted seq as the snapshot to the parent class to // ensure that the parent will not prematurely filter out our own writes.
We // will do the exact comparison against snapshots in IsVisibleFullCheck @@ -64,10 +65,23 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted), db_(db), unprep_seqs_(unprep_seqs), - wup_snapshot_(snapshot) {} + wup_snapshot_(snapshot), + backed_by_snapshot_(backed_by_snapshot) { + (void)backed_by_snapshot_; // to silence unused private field warning + } + + virtual ~WriteUnpreparedTxnReadCallback() { + // If it is not backed by a snapshot, the caller must check validity + assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot); + } virtual bool IsVisibleFullCheck(SequenceNumber seq) override; + inline bool valid() { + valid_checked_ = true; + return snap_released_ == false; + } + void Refresh(SequenceNumber seq) override { max_visible_seq_ = std::max(max_visible_seq_, seq); wup_snapshot_ = seq; @@ -88,6 +102,11 @@ class WriteUnpreparedTxnReadCallback : public ReadCallback { WritePreparedTxnDB* db_; const std::map<SequenceNumber, size_t>& unprep_seqs_; SequenceNumber wup_snapshot_; + // Whether max_visible_seq_ is backed by a snapshot + const SnapshotBackup backed_by_snapshot_; + bool snap_released_ = false; + // Safety check to ensure that the caller has checked invalid statuses + bool valid_checked_ = false; }; class WriteUnpreparedTxn : public WritePreparedTxn { diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 4381619e782..defaf9fce6e 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -348,7 +348,8 @@ struct WriteUnpreparedTxnDB::IteratorState { IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence, std::shared_ptr<ManagedSnapshot> s, SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn) - : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_), + : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_, + kBackedByDBSnapshot), snapshot(s) {} SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); } From b1a02ffeabb3ad3edceddf31f88c7543f01a03d4 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 5 Aug 2019 15:40:31 -0700 Subject: [PATCH 279/572] Fix make target 'all' and 'check' (#5672) Summary: If a test is one of the parallel tests, then it should also be listed in 'TESTS'. Otherwise, `make all` won't build the binaries. For example, ``` $COMPILE_WITH_ASAN=1 make -j32 all ``` Then if you do ``` $make check ``` The second command will invoke the compilation and building for db_bloom_filter_test and file_reader_writer_test **without** `COMPILE_WITH_ASAN=1`, causing the command to fail. Test plan (on devserver): ``` $make -j32 all ``` Verify all binaries are built so that `make check` won't have to compile anything.
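For illustration, a minimal hand-written Makefile fragment showing the failure mode (this is only a sketch, not the real Makefile; `run_parallel` is a made-up helper):
```
TESTS = db_test                        # binaries built by 'make all'
PARALLEL_TEST = db_bloom_filter_test   # run by 'make check', missing above

all: $(TESTS)

check: all
	# db_bloom_filter_test was not built by 'all', so make compiles it here,
	# without the flags (e.g. COMPILE_WITH_ASAN=1) that 'all' was given.
	./run_parallel $(PARALLEL_TEST)
```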
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5672 Differential Revision: D16655834 Pulled By: riversand963 fbshipit-source-id: 050131412b5313496f85ae3deeeeb8d28af75746 --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index fbe6d2d06ff..4502be8e46b 100644 --- a/Makefile +++ b/Makefile @@ -445,6 +445,7 @@ TESTS = \ db_iter_test \ db_iter_stress_test \ db_log_iter_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_dynamic_level_test \ @@ -479,6 +480,7 @@ TESTS = \ fault_injection_test \ filelock_test \ filename_test \ + file_reader_writer_test \ block_based_filter_block_test \ full_filter_block_test \ partitioned_filter_block_test \ From f4a616ebf9e4417fe74e459ae58e4d31642bafcb Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Mon, 5 Aug 2019 18:31:42 -0700 Subject: [PATCH 280/572] Block cache analyzer: python script to plot graphs (#5673) Summary: This PR updated the python script to plot graphs for stats output from block cache analyzer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5673 Test Plan: Manually run the script to generate graphs. Differential Revision: D16657145 Pulled By: HaoyuHuang fbshipit-source-id: fd510b5fd4307835f9a986fac545734dbe003d28 --- .../block_cache_trace_analyzer_plot.py | 402 ++++++++++++++++-- 1 file changed, 360 insertions(+), 42 deletions(-) diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py index 22d56b932c5..0fdaa41586e 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py @@ -1,12 +1,17 @@ #!/usr/bin/env python3 import csv +import math import os import random import sys +import matplotlib +matplotlib.use("Agg") import matplotlib.backends.backend_pdf import matplotlib.pyplot as plt import numpy as np +import pandas as pd +import seaborn as sns # Make sure a legend has the same color across all generated graphs. 
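+# A single module-level map from label to color (bar_color_maps, populated
+# below) is shared by the chart helpers, so a given cache configuration is
+# drawn with the same color in every generated graph.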
@@ -19,7 +24,7 @@ def get_cmap(n, name="hsv"): color_index = 0 bar_color_maps = {} colors = [] -n_colors = 60 +n_colors = 360 linear_colors = get_cmap(n_colors) for i in range(n_colors): colors.append(linear_colors(i)) @@ -35,41 +40,95 @@ def num_to_gb(n): return "{0:.2f}".format(float(n) / one_gb) -def plot_miss_ratio_graphs(csv_result_dir, output_result_dir): - mrc_file_path = csv_result_dir + "/mrc" - if not os.path.exists(mrc_file_path): - return +def plot_miss_stats_graphs( + csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name +): miss_ratios = {} - print("Processing file {}".format(mrc_file_path)) - with open(mrc_file_path, "r") as csvfile: - rows = csv.reader(csvfile, delimiter=",") - is_header = False - for row in rows: - if not is_header: - is_header = True - continue - cache_name = row[0] - num_shard_bits = int(row[1]) - ghost_capacity = int(row[2]) - capacity = int(row[3]) - miss_ratio = float(row[4]) - config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) - if config not in miss_ratios: - miss_ratios[config] = {} - miss_ratios[config]["x"] = [] - miss_ratios[config]["y"] = [] - miss_ratios[config]["x"].append(num_to_gb(capacity)) - miss_ratios[config]["y"].append(miss_ratio) + for file in os.listdir(csv_result_dir): + if not file.startswith(file_prefix): + continue + if not file.endswith(file_suffix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) + mrc_file_path = csv_result_dir + "/" + file + with open(mrc_file_path, "r") as csvfile: + rows = csv.reader(csvfile, delimiter=",") + for row in rows: + cache_name = row[0] + num_shard_bits = int(row[1]) + ghost_capacity = int(row[2]) + capacity = int(row[3]) + miss_ratio = float(row[4]) + config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) + if config not in miss_ratios: + miss_ratios[config] = {} + miss_ratios[config]["x"] = [] + miss_ratios[config]["y"] = [] + miss_ratios[config]["x"].append(capacity) + miss_ratios[config]["y"].append(miss_ratio) + fig = plt.figure() + for config in miss_ratios: + plt.plot( + miss_ratios[config]["x"], miss_ratios[config]["y"], label=config + ) + plt.xlabel("Cache capacity") + plt.ylabel(ylabel) + plt.xscale("log", basex=2) + plt.ylim(ymin=0) + plt.title("{}".format(file)) + plt.legend() + fig.savefig( + output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight" + ) + + +def plot_miss_stats_diff_lru_graphs( + csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name +): + miss_ratios = {} + for file in os.listdir(csv_result_dir): + if not file.startswith(file_prefix): + continue + if not file.endswith(file_suffix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) + mrc_file_path = csv_result_dir + "/" + file + with open(mrc_file_path, "r") as csvfile: + rows = csv.reader(csvfile, delimiter=",") + for row in rows: + cache_name = row[0] + num_shard_bits = int(row[1]) + ghost_capacity = int(row[2]) + capacity = int(row[3]) + miss_ratio = float(row[4]) + config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) + if config not in miss_ratios: + miss_ratios[config] = {} + miss_ratios[config]["x"] = [] + miss_ratios[config]["y"] = [] + miss_ratios[config]["x"].append(capacity) + miss_ratios[config]["y"].append(miss_ratio) + if "lru-0-0" not in miss_ratios: + return fig = plt.figure() for config in miss_ratios: - plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config) - plt.xlabel("Cache capacity (GB)") - 
plt.ylabel("Miss Ratio (%)") - # plt.xscale('log', basex=2) - plt.ylim(ymin=0) - plt.title("RocksDB block cache miss ratios") + diffs = [0] * len(miss_ratios["lru-0-0"]["x"]) + for i in range(len(miss_ratios["lru-0-0"]["x"])): + for j in range(len(miss_ratios[config]["x"])): + if miss_ratios["lru-0-0"]["x"][i] == miss_ratios[config]["x"][j]: + diffs[i] = ( + miss_ratios[config]["y"][j] - miss_ratios["lru-0-0"]["y"][i] + ) + break + plt.plot(miss_ratios["lru-0-0"]["x"], diffs, label=config) + plt.xlabel("Cache capacity") + plt.ylabel(ylabel) + plt.xscale("log", basex=2) + plt.title("{}".format(file)) plt.legend() - fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight") + fig.savefig( + output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight" + ) def sanitize(label): @@ -143,6 +202,7 @@ def read_data_for_plot(csvfile, vertical): def plot_line_charts( csv_result_dir, output_result_dir, + filename_prefix, filename_suffix, pdf_name, xlabel, @@ -151,11 +211,14 @@ def plot_line_charts( vertical, legend, ): + global color_index, bar_color_maps, colors pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name) for file in os.listdir(csv_result_dir): if not file.endswith(filename_suffix): continue - print("Processing file {}".format(file)) + if not file.startswith(filename_prefix): + continue + print("Processing file {}/{}".format(csv_result_dir, file)) with open(csv_result_dir + "/" + file, "r") as csvfile: x, labels, label_stats = read_data_for_plot(csvfile, vertical) if len(x) == 0 or len(labels) == 0: @@ -163,10 +226,15 @@ def plot_line_charts( # plot figure fig = plt.figure() for label_index in label_stats: + # Assign a unique color to this label. + if labels[label_index] not in bar_color_maps: + bar_color_maps[labels[label_index]] = colors[color_index] + color_index += 1 plt.plot( - [int(x[i]) for i in range(len(x))], - label_stats[label_index], + [int(x[i]) for i in range(len(x) - 1)], + label_stats[label_index][:-1], label=labels[label_index], + color=bar_color_maps[labels[label_index]], ) # Translate time unit into x labels. @@ -239,10 +307,29 @@ def plot_stacked_bar_charts( pdf.close() -def plot_access_timeline(csv_result_dir, output_result_dir): +def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title): + pdf = matplotlib.backends.backend_pdf.PdfPages( + "{}/{}".format(output_result_dir, pdf_name) + ) + for file in os.listdir(csv_result_dir): + if not file.endswith(filename_suffix): + continue + csv_file_name = "{}/{}".format(csv_result_dir, file) + print("Processing file {}/{}".format(csv_result_dir, file)) + corr_table = pd.read_csv(csv_file_name) + corr_table = corr_table.pivot("label", "corr", "value") + fig = plt.figure() + sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2") + plt.title("{} filename:{}".format(title, file)) + pdf.savefig(fig) + pdf.close() + + +def plot_timeline(csv_result_dir, output_result_dir): plot_line_charts( csv_result_dir, output_result_dir, + filename_prefix="", filename_suffix="access_timeline", pdf_name="access_time.pdf", xlabel="Time", @@ -253,6 +340,109 @@ def plot_access_timeline(csv_result_dir, output_result_dir): ) +def convert_to_0_if_nan(n): + if math.isnan(n): + return 0.0 + return n + + +def plot_correlation(csv_result_dir, output_result_dir): + # Processing the correlation input first. 
+ label_str_file = {} + for file in os.listdir(csv_result_dir): + if not file.endswith("correlation_input"): + continue + csv_file_name = "{}/{}".format(csv_result_dir, file) + print("Processing file {}/{}".format(csv_result_dir, file)) + corr_table = pd.read_csv(csv_file_name) + label_str = file.split("_")[0] + label = file[len(label_str) + 1 :] + label = label[: len(label) - len("_correlation_input")] + + output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str) + if output_file not in label_str_file: + f = open("{}/{}_correlation_output".format(csv_result_dir, label_str), "w+") + label_str_file[output_file] = f + f.write("label,corr,value\n") + f = label_str_file[output_file] + f.write( + "{},{},{}\n".format( + label, + "LA+A", + convert_to_0_if_nan( + corr_table["num_accesses_since_last_access"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "PA+A", + convert_to_0_if_nan( + corr_table["num_past_accesses"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LT+A", + convert_to_0_if_nan( + corr_table["elapsed_time_since_last_access"].corr( + corr_table["num_accesses_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LA+T", + convert_to_0_if_nan( + corr_table["num_accesses_since_last_access"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "LT+T", + convert_to_0_if_nan( + corr_table["elapsed_time_since_last_access"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + f.write( + "{},{},{}\n".format( + label, + "PA+T", + convert_to_0_if_nan( + corr_table["num_past_accesses"].corr( + corr_table["elapsed_time_till_next_access"], method="spearman" + ) + ), + ) + ) + for label_str in label_str_file: + label_str_file[label_str].close() + + plot_heatmap( + csv_result_dir, + output_result_dir, + "correlation_output", + "correlation.pdf", + "Correlation", + ) + + def plot_reuse_graphs(csv_result_dir, output_result_dir): plot_stacked_bar_charts( csv_result_dir, @@ -301,6 +491,7 @@ def plot_reuse_graphs(csv_result_dir, output_result_dir): plot_line_charts( csv_result_dir, output_result_dir, + filename_prefix="", filename_suffix="reuse_blocks_timeline", pdf_name="reuse_blocks_timeline.pdf", xlabel="", @@ -370,14 +561,90 @@ def plot_access_count_summary(csv_result_dir, output_result_dir): vertical=True, x_prefix="< ", ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="skewness", + pdf_name="skew.pdf", + xlabel="", + ylabel="Percentage of accesses", + title="Skewness", + vertical=True, + legend=False, + ) + + +def plot_miss_ratio_timeline(csv_result_dir, output_result_dir): + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_ratio_timeline", + pdf_name="miss_ratio_timeline.pdf", + xlabel="Time", + ylabel="Miss Ratio (%)", + title="Miss ratio timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_miss_timeline", + pdf_name="miss_timeline.pdf", + xlabel="Time", + ylabel="# of misses ", + title="Miss timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + 
filename_suffix="3600_miss_timeline", + pdf_name="miss_timeline.pdf", + xlabel="Time", + ylabel="# of misses ", + title="Miss timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_policy_timeline", + pdf_name="policy_timeline.pdf", + xlabel="Time", + ylabel="# of times a policy is selected ", + title="Policy timeline", + vertical=False, + legend=True, + ) + plot_line_charts( + csv_result_dir, + output_result_dir, + filename_prefix="", + filename_suffix="3600_policy_ratio_timeline", + pdf_name="policy_ratio_timeline.pdf", + xlabel="Time", + ylabel="Percentage of times a policy is selected ", + title="Policy timeline", + vertical=False, + legend=True, + ) if __name__ == "__main__": if len(sys.argv) < 3: print( - "Must provide two arguments: 1) The directory that saves a list of " - "directories which contain block cache trace analyzer result files " - "2) the directory to save plotted graphs." + "Must provide two arguments: \n" + "1) The directory that saves a list of " + "directories which contain block cache trace analyzer result files. \n" + "2) the directory to save plotted graphs. \n" ) exit(1) csv_result_dir = sys.argv[1] @@ -396,8 +663,59 @@ def plot_access_count_summary(csv_result_dir, output_result_dir): print("Processing experiment dir: {}".format(csv_relative_dir)) if not os.path.exists(result_dir): os.makedirs(result_dir) - plot_miss_ratio_graphs(csv_abs_dir, result_dir) - plot_access_timeline(csv_abs_dir, result_dir) + plot_access_count_summary(csv_abs_dir, result_dir) + plot_timeline(csv_abs_dir, result_dir) + plot_miss_ratio_timeline(csv_result_dir, output_result_dir) + plot_correlation(csv_abs_dir, result_dir) plot_reuse_graphs(csv_abs_dir, result_dir) plot_percentage_access_summary(csv_abs_dir, result_dir) - plot_access_count_summary(csv_abs_dir, result_dir) + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="", + file_suffix="mrc", + ylabel="Miss ratio (%)", + pdf_file_name="mrc", + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="", + file_suffix="mrc", + ylabel="Miss ratio (%)", + pdf_file_name="mrc_diff_lru", + ) + # The following stats are only available in pysim. 
+ for time_unit in ["1", "60", "3600"]: + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="p95mb", + ylabel="p95 number of byte miss per {} seconds".format(time_unit), + pdf_file_name="p95mb_per{}_seconds".format(time_unit), + ) + plot_miss_stats_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="avgmb", + ylabel="Average number of byte miss per {} seconds".format(time_unit), + pdf_file_name="avgmb_per{}_seconds".format(time_unit), + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="p95mb", + ylabel="p95 number of byte miss per {} seconds".format(time_unit), + pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit), + ) + plot_miss_stats_diff_lru_graphs( + csv_abs_dir, + result_dir, + file_prefix="ml_{}_".format(time_unit), + file_suffix="avgmb", + ylabel="Average number of byte miss per {} seconds".format(time_unit), + pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit), + ) From cc9fa7fcdb35fdd12505053b2a6cd38140c93d3b Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Mon, 5 Aug 2019 19:47:33 -0700 Subject: [PATCH 281/572] cmake: cmake related cleanups (#5662) Summary: - cmake: use the builtin FindBzip2.cmake from CMake - cmake: require CMake v3.5.1 - cmake: add imported target for 3rd party libraries - cmake: extract ReadVersion.cmake out and refactor it Pull Request resolved: https://github.com/facebook/rocksdb/pull/5662 Differential Revision: D16660974 Pulled By: maysamyabandeh fbshipit-source-id: 681594910e74253251fe14ad0befc41a4d0f4fd4 --- CMakeLists.txt | 65 ++++++++++++-------------------- cmake/modules/FindJeMalloc.cmake | 24 ++++++++---- cmake/modules/FindNUMA.cmake | 16 ++++++-- cmake/modules/FindTBB.cmake | 26 ++++++++----- cmake/modules/Findbzip2.cmake | 21 ----------- cmake/modules/Findlz4.cmake | 28 +++++++++----- cmake/modules/Findsnappy.cmake | 26 ++++++++----- cmake/modules/Findzstd.cmake | 28 +++++++++----- cmake/modules/ReadVersion.cmake | 10 +++++ 9 files changed, 132 insertions(+), 112 deletions(-) delete mode 100644 cmake/modules/Findbzip2.cmake create mode 100644 cmake/modules/ReadVersion.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 7266f3b55c8..bb99d1b7ec8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,18 +32,19 @@ # 3. cmake .. # 4. 
make -j -cmake_minimum_required(VERSION 2.8.12) -project(rocksdb) -enable_language(CXX) -enable_language(C) -enable_language(ASM) +cmake_minimum_required(VERSION 3.5.1) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +include(ReadVersion) +get_rocksdb_version(rocksdb_VERSION) +project(rocksdb + VERSION ${rocksdb_VERSION} + LANGUAGES CXX C ASM) if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) endif() -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) @@ -74,8 +75,7 @@ else() if(WITH_JEMALLOC) find_package(JeMalloc REQUIRED) add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) - include_directories(${JEMALLOC_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${JEMALLOC_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS JeMalloc::JeMalloc) endif() endif() @@ -93,43 +93,38 @@ else() if(WITH_SNAPPY) find_package(snappy REQUIRED) add_definitions(-DSNAPPY) - include_directories(${SNAPPY_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${SNAPPY_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS snappy::snappy) endif() if(WITH_ZLIB) find_package(ZLIB REQUIRED) add_definitions(-DZLIB) - if(ZLIB_INCLUDE_DIRS) - # CMake 3 - include_directories(${ZLIB_INCLUDE_DIRS}) - else() - # CMake 2 - include_directories(${ZLIB_INCLUDE_DIR}) - endif() - list(APPEND THIRDPARTY_LIBS ${ZLIB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS ZLIB::ZLIB) endif() option(WITH_BZ2 "build with bzip2" OFF) if(WITH_BZ2) - find_package(bzip2 REQUIRED) + find_package(BZip2 REQUIRED) add_definitions(-DBZIP2) - include_directories(${BZIP2_INCLUDE_DIR}) + if(BZIP2_INCLUDE_DIRS) + include_directories(${BZIP2_INCLUDE_DIRS}) + else() + include_directories(${BZIP2_INCLUDE_DIR}) + endif() list(APPEND THIRDPARTY_LIBS ${BZIP2_LIBRARIES}) endif() if(WITH_LZ4) find_package(lz4 REQUIRED) add_definitions(-DLZ4) - include_directories(${LZ4_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${LZ4_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS lz4::lz4) endif() if(WITH_ZSTD) find_package(zstd REQUIRED) add_definitions(-DZSTD) include_directories(${ZSTD_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${ZSTD_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS zstd::zstd) endif() endif() @@ -150,17 +145,6 @@ endif() string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") -# Read rocksdb version from version.h header file. 
-file(READ include/rocksdb/version.h version_header_file) -string(REGEX MATCH "#define ROCKSDB_MAJOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MAJOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_MINOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MINOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_PATCH ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_PATCH ${CMAKE_MATCH_1}) -set(ROCKSDB_VERSION ${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}) - - option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) if(WITH_MD_LIBRARY) @@ -316,15 +300,14 @@ if(WITH_NUMA) find_package(NUMA REQUIRED) add_definitions(-DNUMA) include_directories(${NUMA_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${NUMA_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS NUMA::NUMA) endif() option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) if(WITH_TBB) find_package(TBB REQUIRED) add_definitions(-DTBB) - include_directories(${TBB_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${TBB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS TBB::TBB) endif() # Stall notifications eat some performance from inserts @@ -777,8 +760,8 @@ else() ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX - VERSION ${ROCKSDB_VERSION} - SOVERSION ${ROCKSDB_VERSION_MAJOR} + VERSION ${rocksdb_VERSION} + SOVERSION ${rocksdb_VERSION_MAJOR} CXX_STANDARD 11 OUTPUT_NAME "rocksdb") endif() @@ -833,7 +816,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) write_basic_package_version_file( RocksDBConfigVersion.cmake - VERSION ${ROCKSDB_VERSION} + VERSION ${rocksdb_VERSION} COMPATIBILITY SameMajorVersion ) diff --git a/cmake/modules/FindJeMalloc.cmake b/cmake/modules/FindJeMalloc.cmake index 7911f77c4c3..f695b3ed1b3 100644 --- a/cmake/modules/FindJeMalloc.cmake +++ b/cmake/modules/FindJeMalloc.cmake @@ -1,21 +1,29 @@ # - Find JeMalloc library # Find the native JeMalloc includes and library # -# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. -# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. -# JEMALLOC_FOUND - True if jemalloc found. +# JeMalloc_INCLUDE_DIRS - where to find jemalloc.h, etc. +# JeMalloc_LIBRARIES - List of libraries when using jemalloc. +# JeMalloc_FOUND - True if jemalloc found. -find_path(JEMALLOC_INCLUDE_DIR +find_path(JeMalloc_INCLUDE_DIRS NAMES jemalloc/jemalloc.h HINTS ${JEMALLOC_ROOT_DIR}/include) -find_library(JEMALLOC_LIBRARIES +find_library(JeMalloc_LIBRARIES NAMES jemalloc HINTS ${JEMALLOC_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) +find_package_handle_standard_args(JeMalloc DEFAULT_MSG JeMalloc_LIBRARIES JeMalloc_INCLUDE_DIRS) mark_as_advanced( - JEMALLOC_LIBRARIES - JEMALLOC_INCLUDE_DIR) + JeMalloc_LIBRARIES + JeMalloc_INCLUDE_DIRS) + +if(JeMalloc_FOUND AND NOT (TARGET JeMalloc::JeMalloc)) + add_library (JeMalloc::JeMalloc UNKNOWN IMPORTED) + set_target_properties(JeMalloc::JeMalloc + PROPERTIES + IMPORTED_LOCATION ${JeMalloc_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${JeMalloc_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindNUMA.cmake b/cmake/modules/FindNUMA.cmake index 02760344c68..69b95c9b60b 100644 --- a/cmake/modules/FindNUMA.cmake +++ b/cmake/modules/FindNUMA.cmake @@ -1,11 +1,11 @@ # - Find NUMA # Find the NUMA library and includes # -# NUMA_INCLUDE_DIR - where to find numa.h, etc. +# NUMA_INCLUDE_DIRS - where to find numa.h, etc. 
# NUMA_LIBRARIES - List of libraries when using NUMA. # NUMA_FOUND - True if NUMA found. -find_path(NUMA_INCLUDE_DIR +find_path(NUMA_INCLUDE_DIRS NAMES numa.h numaif.h HINTS ${NUMA_ROOT_DIR}/include) @@ -14,8 +14,16 @@ find_library(NUMA_LIBRARIES HINTS ${NUMA_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIR) +find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIRS) mark_as_advanced( NUMA_LIBRARIES - NUMA_INCLUDE_DIR) + NUMA_INCLUDE_DIRS) + +if(NUMA_FOUND AND NOT (TARGET NUMA::NUMA)) + add_library (NUMA::NUMA UNKNOWN IMPORTED) + set_target_properties(NUMA::NUMA + PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindTBB.cmake b/cmake/modules/FindTBB.cmake index 556ce872b17..f6861fa5521 100644 --- a/cmake/modules/FindTBB.cmake +++ b/cmake/modules/FindTBB.cmake @@ -1,7 +1,7 @@ # - Find TBB # Find the Thread Building Blocks library and includes # -# TBB_INCLUDE_DIR - where to find tbb.h, etc. +# TBB_INCLUDE_DIRS - where to find tbb.h, etc. # TBB_LIBRARIES - List of libraries when using TBB. # TBB_FOUND - True if TBB found. @@ -9,17 +9,25 @@ if(NOT DEFINED TBB_ROOT_DIR) set(TBB_ROOT_DIR "$ENV{TBBROOT}") endif() -find_path(TBB_INCLUDE_DIR -NAMES tbb/tbb.h -HINTS ${TBB_ROOT_DIR}/include) +find_path(TBB_INCLUDE_DIRS + NAMES tbb/tbb.h + HINTS ${TBB_ROOT_DIR}/include) find_library(TBB_LIBRARIES -NAMES tbb -HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) + NAMES tbb + HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIR) +find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIRS) mark_as_advanced( -TBB_LIBRARIES -TBB_INCLUDE_DIR) + TBB_LIBRARIES + TBB_INCLUDE_DIRS) + +if(TBB_FOUND AND NOT (TARGET TBB::TBB)) + add_library (TBB::TBB UNKNOWN IMPORTED) + set_target_properties(TBB::TBB + PROPERTIES + IMPORTED_LOCATION ${TBB_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${TBB_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findbzip2.cmake b/cmake/modules/Findbzip2.cmake deleted file mode 100644 index 87abbe941e0..00000000000 --- a/cmake/modules/Findbzip2.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# - Find Bzip2 -# Find the bzip2 compression library and includes -# -# BZIP2_INCLUDE_DIR - where to find bzlib.h, etc. -# BZIP2_LIBRARIES - List of libraries when using bzip2. -# BZIP2_FOUND - True if bzip2 found. - -find_path(BZIP2_INCLUDE_DIR - NAMES bzlib.h - HINTS ${BZIP2_ROOT_DIR}/include) - -find_library(BZIP2_LIBRARIES - NAMES bz2 - HINTS ${BZIP2_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(bzip2 DEFAULT_MSG BZIP2_LIBRARIES BZIP2_INCLUDE_DIR) - -mark_as_advanced( - BZIP2_LIBRARIES - BZIP2_INCLUDE_DIR) diff --git a/cmake/modules/Findlz4.cmake b/cmake/modules/Findlz4.cmake index c34acef5e39..7cf7d7f5fe3 100644 --- a/cmake/modules/Findlz4.cmake +++ b/cmake/modules/Findlz4.cmake @@ -1,21 +1,29 @@ # - Find Lz4 # Find the lz4 compression library and includes # -# LZ4_INCLUDE_DIR - where to find lz4.h, etc. -# LZ4_LIBRARIES - List of libraries when using lz4. -# LZ4_FOUND - True if lz4 found. +# lz4_INCLUDE_DIRS - where to find lz4.h, etc. +# lz4_LIBRARIES - List of libraries when using lz4. +# lz4_FOUND - True if lz4 found. 
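+# An imported target lz4::lz4 is also defined, so that linking against it
+# propagates the include directories automatically.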
-find_path(LZ4_INCLUDE_DIR +find_path(lz4_INCLUDE_DIRS NAMES lz4.h - HINTS ${LZ4_ROOT_DIR}/include) + HINTS ${lz4_ROOT_DIR}/include) -find_library(LZ4_LIBRARIES +find_library(lz4_LIBRARIES NAMES lz4 - HINTS ${LZ4_ROOT_DIR}/lib) + HINTS ${lz4_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(lz4 DEFAULT_MSG LZ4_LIBRARIES LZ4_INCLUDE_DIR) +find_package_handle_standard_args(lz4 DEFAULT_MSG lz4_LIBRARIES lz4_INCLUDE_DIRS) mark_as_advanced( - LZ4_LIBRARIES - LZ4_INCLUDE_DIR) + lz4_LIBRARIES + lz4_INCLUDE_DIRS) + +if(lz4_FOUND AND NOT (TARGET lz4::lz4)) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties(lz4::lz4 + PROPERTIES + IMPORTED_LOCATION ${lz4_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${lz4_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findsnappy.cmake b/cmake/modules/Findsnappy.cmake index 6ed5fda3d57..2de2889c1a6 100644 --- a/cmake/modules/Findsnappy.cmake +++ b/cmake/modules/Findsnappy.cmake @@ -1,21 +1,29 @@ # - Find Snappy # Find the snappy compression library and includes # -# SNAPPY_INCLUDE_DIR - where to find snappy.h, etc. -# SNAPPY_LIBRARIES - List of libraries when using snappy. -# SNAPPY_FOUND - True if snappy found. +# snappy_INCLUDE_DIRS - where to find snappy.h, etc. +# snappy_LIBRARIES - List of libraries when using snappy. +# snappy_FOUND - True if snappy found. -find_path(SNAPPY_INCLUDE_DIR +find_path(snappy_INCLUDE_DIRS NAMES snappy.h - HINTS ${SNAPPY_ROOT_DIR}/include) + HINTS ${snappy_ROOT_DIR}/include) find_library(SNAPPY_LIBRARIES NAMES snappy - HINTS ${SNAPPY_ROOT_DIR}/lib) + HINTS ${snappy_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(snappy DEFAULT_MSG SNAPPY_LIBRARIES SNAPPY_INCLUDE_DIR) +find_package_handle_standard_args(snappy DEFAULT_MSG snappy_LIBRARIES snappy_INCLUDE_DIRS) mark_as_advanced( - SNAPPY_LIBRARIES - SNAPPY_INCLUDE_DIR) + snappy_LIBRARIES + snappy_INCLUDE_DIRS) + +if(snappy_FOUND AND NOT (TARGET snappy::snappy)) + add_library (snappy::snappy UNKNOWN IMPORTED) + set_target_properties(snappy::snappy + PROPERTIES + IMPORTED_LOCATION ${snappy_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${snappy_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findzstd.cmake b/cmake/modules/Findzstd.cmake index a2964aa9f80..9430821df6e 100644 --- a/cmake/modules/Findzstd.cmake +++ b/cmake/modules/Findzstd.cmake @@ -1,21 +1,29 @@ # - Find zstd # Find the zstd compression library and includes # -# ZSTD_INCLUDE_DIR - where to find zstd.h, etc. -# ZSTD_LIBRARIES - List of libraries when using zstd. -# ZSTD_FOUND - True if zstd found. +# zstd_INCLUDE_DIRS - where to find zstd.h, etc. +# zstd_LIBRARIES - List of libraries when using zstd. +# zstd_FOUND - True if zstd found. 
-find_path(ZSTD_INCLUDE_DIR +find_path(zstd_INCLUDE_DIRS NAMES zstd.h - HINTS ${ZSTD_ROOT_DIR}/include) + HINTS ${zstd_ROOT_DIR}/include) -find_library(ZSTD_LIBRARIES +find_library(zstd_LIBRARIES NAMES zstd - HINTS ${ZSTD_ROOT_DIR}/lib) + HINTS ${zstd_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(zstd DEFAULT_MSG ZSTD_LIBRARIES ZSTD_INCLUDE_DIR) +find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS) mark_as_advanced( - ZSTD_LIBRARIES - ZSTD_INCLUDE_DIR) + zstd_LIBRARIES + zstd_INCLUDE_DIRS) + +if(zstd_FOUND AND NOT (TARGET zstd::zstd)) + add_library (zstd::zstd UNKNOWN IMPORTED) + set_target_properties(zstd::zstd + PROPERTIES + IMPORTED_LOCATION ${zstd_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/ReadVersion.cmake b/cmake/modules/ReadVersion.cmake new file mode 100644 index 00000000000..ae356d99659 --- /dev/null +++ b/cmake/modules/ReadVersion.cmake @@ -0,0 +1,10 @@ +# Read rocksdb version from version.h header file. + +function(get_rocksdb_version version_var) + file(READ "${CMAKE_SOURCE_DIR}/include/rocksdb/version.h" version_header_file) + foreach(component MAJOR MINOR PATCH) + string(REGEX MATCH "#define ROCKSDB_${component} ([0-9]+)" _ ${version_header_file}) + set(ROCKSDB_VERSION_${component} ${CMAKE_MATCH_1}) + endforeach() + set(${version_var} "${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}" PARENT_SCOPE) +endfunction() From 4f98b43ba3d5ee469af4429981b01f086bf6102a Mon Sep 17 00:00:00 2001 From: Yun Tang Date: Tue, 6 Aug 2019 09:10:32 -0700 Subject: [PATCH 282/572] Correct the default write buffer size of java doc (#5670) Summary: The actual value of default write buffer size within `rocksdb/include/rocksdb/options.h` is 64 MB, we should correct this value in java doc. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5670 Differential Revision: D16668815 Pulled By: maysamyabandeh fbshipit-source-id: cc3a981c9f1c2cd4a8392b0ed5f1fd0a2d729afb --- .../java/org/rocksdb/MutableColumnFamilyOptionsInterface.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java index 4f4749646f8..4ae96daaf8a 100644 --- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -21,7 +21,7 @@ public interface MutableColumnFamilyOptionsInterface * Also, a larger write buffer will result in a longer recovery time * the next time the database is opened. * - * Default: 4MB + * Default: 64MB * @param writeBufferSize the size of write buffer. * @return the instance of the current object. * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms From d150e01474a0cb281792f51b81260b629b18457f Mon Sep 17 00:00:00 2001 From: Vijay Nadimpalli Date: Tue, 6 Aug 2019 14:22:34 -0700 Subject: [PATCH 283/572] New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). 
If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf --- CMakeLists.txt | 2 + Makefile | 4 + TARGETS | 8 + appveyor.yml | 2 +- db/compacted_db_impl.cc | 4 +- db/db_blob_index_test.cc | 8 +- db/db_impl/db_impl.cc | 112 +++++--- db/db_impl/db_impl.h | 43 +++- db/db_impl/db_impl_files.cc | 3 +- db/db_merge_operand_test.cc | 240 ++++++++++++++++++ db/db_merge_operator_test.cc | 8 +- db/db_test.cc | 9 + db/db_test2.cc | 8 +- db/memtable.cc | 32 ++- db/memtable.h | 13 +- db/memtable_list.cc | 14 + db/memtable_list.h | 7 + db/version_set.cc | 17 +- db/version_set.h | 30 ++- file/filename.cc | 3 +- include/rocksdb/db.h | 20 ++ include/rocksdb/status.h | 1 + include/rocksdb/utilities/stackable_db.h | 11 + src.mk | 2 + .../block_based/data_block_hash_index_test.cc | 8 +- table/cuckoo/cuckoo_table_reader_test.cc | 12 +- table/get_context.cc | 113 ++++++--- table/get_context.h | 15 +- table/table_reader_bench.cc | 2 +- table/table_test.cc | 16 +- tools/db_bench_tool.cc | 105 +++++++- utilities/blob_db/blob_db_impl.cc | 17 +- utilities/merge_operators.h | 3 + utilities/merge_operators/sortlist.cc | 100 ++++++++ utilities/merge_operators/sortlist.h | 38 +++ utilities/transactions/write_prepared_txn.cc | 8 +- .../transactions/write_prepared_txn_db.cc | 8 +- .../transactions/write_unprepared_txn.cc | 16 +- .../transactions/write_unprepared_txn_db.cc | 8 +- .../write_batch_with_index.cc | 7 +- 40 files changed, 914 insertions(+), 163 deletions(-) create mode 100644 db/db_merge_operand_test.cc create mode 100644 utilities/merge_operators/sortlist.cc create mode 100644 
utilities/merge_operators/sortlist.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bb99d1b7ec8..8622242aa75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -661,6 +661,7 @@ set(SOURCES utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc + utilities/merge_operators/sortlist.cc utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc @@ -887,6 +888,7 @@ if(WITH_TESTS) db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc + db/db_merge_operand_test.cc db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc diff --git a/Makefile b/Makefile index 4502be8e46b..1718309cb89 100644 --- a/Makefile +++ b/Makefile @@ -454,6 +454,7 @@ TESTS = \ db_iterator_test \ db_memtable_test \ db_merge_operator_test \ + db_merge_operand_test \ db_options_test \ db_range_del_test \ db_secondary_test \ @@ -1254,6 +1255,9 @@ db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHA db_merge_operator_test: db/db_merge_operator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_merge_operand_test: db/db_merge_operand_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 25d7ff66759..bac5c4311aa 100644 --- a/TARGETS +++ b/TARGETS @@ -301,6 +301,7 @@ cpp_library( "utilities/merge_operators/bytesxor.cc", "utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", + "utilities/merge_operators/sortlist.cc", "utilities/merge_operators/string_append/stringappend.cc", "utilities/merge_operators/string_append/stringappend2.cc", "utilities/merge_operators/uint64add.cc", @@ -755,6 +756,13 @@ ROCKS_TESTS = [ [], [], ], + [ + "db_merge_operand_test", + "db/db_merge_operand_test.cc", + "parallel", + [], + [], + ], [ "db_options_test", "db/db_options_test.cc", diff --git a/appveyor.yml b/appveyor.yml index 6bdb164e84e..77901c40724 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -60,7 +60,7 @@ build: test: test_script: - - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8 on_failure: - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index 88928391ad2..13cccbd7746 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -37,7 +37,7 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value) { GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, key, value, nullptr, nullptr, - nullptr, nullptr); + true, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), &get_context, nullptr); @@ -70,7 +70,7 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, std::string& value = (*values)[idx]; GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, keys[idx], &pinnable_val, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); 
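+      // The 'true' added above is GetContext's new do_merge flag, so this
+      // Get() returns the fully merged value rather than raw merge operands.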
r->Get(options, lkey.internal_key(), &get_context, nullptr); value.assign(pinnable_val.data(), pinnable_val.size()); diff --git a/db/db_blob_index_test.cc b/db/db_blob_index_test.cc index 005a23d63b7..e9618885a2d 100644 --- a/db/db_blob_index_test.cc +++ b/db/db_blob_index_test.cc @@ -63,9 +63,11 @@ class DBBlobIndexTest : public DBTestBase { ReadOptions read_options; read_options.snapshot = snapshot; PinnableSlice value; - auto s = dbfull()->GetImpl(read_options, cfh(), key, &value, - nullptr /*value_found*/, nullptr /*callback*/, - is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); if (s.IsNotFound()) { return "NOT_FOUND"; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 81c44388bcf..9236d911e78 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1441,19 +1441,22 @@ ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + return GetImpl(read_options, key, get_impl_options); } -Status DBImpl::GetImpl(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val, bool* value_found, - ReadCallback* callback, bool* is_blob_index) { - assert(pinnable_val != nullptr); +Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, + GetImplOptions get_impl_options) { + assert(get_impl_options.value != nullptr || + get_impl_options.merge_operands != nullptr); PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = reinterpret_cast(column_family); + auto cfh = + reinterpret_cast(get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { @@ -1461,7 +1464,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // tracing is enabled. InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(column_family, key); + tracer_->Get(get_impl_options.column_family, key); } } @@ -1473,9 +1476,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, SequenceNumber snapshot; if (read_options.snapshot != nullptr) { - if (callback) { + if (get_impl_options.callback) { // Already calculated based on read_options.snapshot - snapshot = callback->max_visible_seq(); + snapshot = get_impl_options.callback->max_visible_seq(); } else { snapshot = reinterpret_cast(read_options.snapshot)->number_; @@ -1489,12 +1492,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, snapshot = last_seq_same_as_publish_seq_ ? versions_->LastSequence() : versions_->LastPublishedSequence(); - if (callback) { + if (get_impl_options.callback) { // The unprep_seqs are not published for write unprepared, so it could be // that max_visible_seq is larger. Seek to the std::max of the two. // However, we still want our callback to contain the actual snapshot so // that it can do the correct visibility filtering. 
- callback->Refresh(snapshot); + get_impl_options.callback->Refresh(snapshot); // Internally, WriteUnpreparedTxnReadCallback::Refresh would set // max_visible_seq = max(max_visible_seq, snapshot) @@ -1505,7 +1508,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // be needed. // // assert(callback->max_visible_seq() >= snapshot); - snapshot = callback->max_visible_seq(); + snapshot = get_impl_options.callback->max_visible_seq(); } } TEST_SYNC_POINT("DBImpl::GetImpl:3"); @@ -1526,19 +1529,39 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { - if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); + // Get value associated with key + if (get_impl_options.get_value) { + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, nullptr, + nullptr, false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } } if (!done && !s.ok() && !s.IsMergeInProgress()) { ReturnAndCleanupSuperVersion(cfd, sv); @@ -1547,9 +1570,14 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &max_covering_tombstone_seq, value_found, nullptr, nullptr, - callback, is_blob_index); + sv->current->Get( + read_options, lkey, get_impl_options.value, &s, &merge_context, + &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); RecordTick(stats_, MEMTABLE_MISS); } @@ -1561,7 +1589,25 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); size_t size = 0; if (s.ok()) { - size = pinnable_val->size(); + if (get_impl_options.get_value) { + size = get_impl_options.value->size(); + } else { + // Return all merge operands for get_impl_options.key + *get_impl_options.number_of_operands = + static_cast(merge_context.GetNumOperands()); + if (*get_impl_options.number_of_operands > + get_impl_options.get_merge_operands_options + ->expected_max_number_of_operands) { + s = Status::Incomplete( + Status::SubCode::KMergeOperandsInsufficientCapacity); + } else { + for (const Slice& sl : merge_context.GetOperands()) { + size += sl.size(); + get_impl_options.merge_operands->PinSelf(sl); + get_impl_options.merge_operands++; + } + } + } RecordTick(stats_, BYTES_READ, size); PERF_COUNTER_ADD(get_read_bytes, size); } @@ -2222,7 +2268,11 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; - auto s = GetImpl(roptions, column_family, key, &pinnable_val, value_found); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = value_found; + auto s = GetImpl(roptions, key, get_impl_options); value->assign(pinnable_val.data(), pinnable_val.size()); // If block_cache is enabled and the index block of the table didn't diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index fe3a2f6f20f..f1dbc5d0286 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -159,6 +159,21 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + using DB::GetMergeOperands; + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.merge_operands = merge_operands; + get_impl_options.get_merge_operands_options = get_merge_operands_options; + get_impl_options.number_of_operands = number_of_operands; + get_impl_options.get_value = false; + return GetImpl(options, key, get_impl_options); + } + using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -395,12 +410,32 @@ class DBImpl : public DB { // ---- End of implementations of the DB interface ---- + struct GetImplOptions { + ColumnFamilyHandle* column_family = nullptr; + PinnableSlice* value = nullptr; + bool* value_found = nullptr; + ReadCallback* callback = nullptr; + bool* is_blob_index = nullptr; + // If true return value associated with key via value pointer else return + // all merge operands for key via merge_operands pointer + bool get_value = true; + // Pointer to an array of size + // get_merge_operands_options.expected_max_number_of_operands allocated by + // user + PinnableSlice* merge_operands = nullptr; + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + int* number_of_operands = nullptr; + }; + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, 
ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + // This function is also called by GetMergeOperands + // If get_impl_options.get_value = true get value associated with + // get_impl_options.key via get_impl_options.value + // If get_impl_options.get_value = false get merge operands associated with + // get_impl_options.key via get_impl_options.merge_operands + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions get_impl_options); ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, ColumnFamilyData* cfd, diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index e3b2f576523..3c5fd4fcd7f 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -318,8 +318,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // We may ignore the dbname when generating the file names. for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( - MakeTableFileName(file.metadata->fd.GetNumber()), - file.path); + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); } diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc new file mode 100644 index 00000000000..e6280ad8c79 --- /dev/null +++ b/db/db_merge_operand_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace rocksdb { + +class DBMergeOperandTest : public DBTestBase { + public: + DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {} +}; + +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. 
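+  // Note: this limit only applies to reads that merge operands (e.g. Get());
+  // GetMergeOperands() returns the raw operands in insertion order instead.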
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector<PinnableSlice> values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // k0 value in memtable + Put("k0", "PutARock"); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "PutARock"); + + // k0.1 value in SST + Put("k0.1", "RockInSST"); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "RockInSST"); + + // All k1 values are in memtable. + ASSERT_OK(Merge("k1", "a")); + Put("k1", "x"); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1.1 values are in memtable. + ASSERT_OK(Merge("k1.1", "r")); + Delete("k1.1"); + ASSERT_OK(Merge("k1.1", "c")); + ASSERT_OK(Merge("k1.1", "k")); + ASSERT_OK(Merge("k1.1", "s")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "c"); + ASSERT_EQ(values[1], "k"); + ASSERT_EQ(values[2], "s"); + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "q"); + ASSERT_EQ(values[1], "w"); + ASSERT_EQ(values[2], "e"); + ASSERT_EQ(values[3], "r"); + + // All k2.1 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.1", "m")); + Put("k2.1", "l"); + ASSERT_OK(Merge("k2.1", "n")); + ASSERT_OK(Merge("k2.1", "o")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "l,n,o"); + + // All k2.2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.2", "g")); + Delete("k2.2"); + ASSERT_OK(Merge("k2.2", "o")); + ASSERT_OK(Merge("k2.2", "t")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "o,t"); + + // Do some compaction that will make the following tests more predictable + // Slice start("PutARock"); + // Slice end("t"); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // All k3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All k3.1 values are flushed and are in different files. + ASSERT_OK(Merge("k3.1", "ab")); + ASSERT_OK(Flush()); + Put("k3.1", "bc"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "bc"); + ASSERT_EQ(values[1], "cd"); + ASSERT_EQ(values[2], "de"); + + // All k3.2 values are flushed and are in different files. + ASSERT_OK(Merge("k3.2", "ab")); + ASSERT_OK(Flush()); + Delete("k3.2"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "cd"); + ASSERT_EQ(values[1], "de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); + + // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + ASSERT_OK(Merge("k5", "who")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Flush()); + Put("k5", "remember"); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "rocks")); + dbfull()->TEST_SwitchMemtable(); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "remember"); + ASSERT_EQ(values[1], "i"); + ASSERT_EQ(values[2], "am"); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 31bd2e491b1..8358ddb56c2 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -46,9 +46,11 @@ class DBMergeOperatorTest : public DBTestBase { ReadOptions read_opt; read_opt.snapshot = snapshot; PinnableSlice value; - Status s = - dbfull()->GetImpl(read_opt, db_->DefaultColumnFamily(), key, &value, - nullptr /*value_found*/, &read_callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = db_->DefaultColumnFamily(); + get_impl_options.value = &value; + get_impl_options.callback = &read_callback; + Status s = dbfull()->GetImpl(read_opt, key, get_impl_options); if (!s.ok()) { return s.ToString(); } diff --git a/db/db_test.cc b/db/db_test.cc index f53afa17d9d..5c96bec36c5 100644 --- a/db/db_test.cc 
+++ b/db/db_test.cc @@ -2540,6 +2540,15 @@ class ModelDB : public DB { return Status::NotSupported(key); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions* /*merge_operands_options*/, + int* /*number_of_operands*/) override { + return Status::NotSupported(key); + } + using DB::MultiGet; std::vector<Status> MultiGet( const ReadOptions& /*options*/, diff --git a/db/db_test2.cc b/db/db_test2.cc index 3664b3a249f..26604c53ad8 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2797,8 +2797,12 @@ TEST_F(DBTest2, ReadCallbackTest) { ReadOptions roptions; TestReadCallback callback(seq); bool dont_care = true; - Status s = dbfull()->GetImpl(roptions, dbfull()->DefaultColumnFamily(), key, - &pinnable_val, &dont_care, &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = dbfull()->DefaultColumnFamily(); + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &dont_care; + get_impl_options.callback = &callback; + Status s = dbfull()->GetImpl(roptions, key, get_impl_options); ASSERT_TRUE(s.ok()); // Assuming that after each Put the DB increased seq by one, the value and // seq number must be equal since we also inc value by 1 after each Put. diff --git a/db/memtable.cc b/db/memtable.cc index fdd1a577ade..62c7339b5d0 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -601,6 +601,7 @@ struct Saver { Logger* logger; Statistics* statistics; bool inplace_update_support; + bool do_merge; Env* env_; ReadCallback* callback_; bool* is_blob_index; @@ -627,7 +628,7 @@ static bool SaveValue(void* arg, const char* entry) { // klength varint32 // userkey char[klength-8] // tag uint64 // vlength varint32 // value char[vlength] // Check that it belongs to same user key.
We do not check the // sequence number since the Seek() call above should have skipped @@ -677,12 +678,24 @@ static bool SaveValue(void* arg, const char* entry) { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->status) = Status::OK(); if (*(s->merge_in_progress)) { - if (s->value != nullptr) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &v, - merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + if (s->do_merge) { + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } + } else { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } + } else if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } else if (s->value != nullptr) { s->value->assign(v.data(), v.size()); } @@ -726,7 +739,8 @@ static bool SaveValue(void* arg, const char* entry) { *(s->merge_in_progress) = true; merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); - if (merge_operator->ShouldMerge(merge_context->GetOperandsDirectionBackward())) { + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, @@ -750,7 +764,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, - ReadCallback* callback, bool* is_blob_index) { + ReadCallback* callback, bool* is_blob_index, bool do_merge) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -810,8 +824,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.env_ = env_; saver.callback_ = callback; saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; table_->Get(key, &saver, SaveValue); - *seq = saver.seq; } diff --git a/db/memtable.h b/db/memtable.h index 6b8c4141f5a..36ba0df79ba 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -175,6 +175,10 @@ class MemTable { const Slice& value, bool allow_concurrent = false, MemTablePostProcessInfo* post_process_info = nullptr); + // Used to get the value associated with key, or to get the merge operands + // associated with key. + // If do_merge = true, the default behavior, which is to get the value for + // key, is executed. Expected behavior is described right below. // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. @@ -188,20 +192,23 @@ class MemTable { // returned). Otherwise, *seq will be set to kMaxSequenceNumber. // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other // status returned indicates a corruption or other unexpected error.
+ // If do_merge = false then any Merge Operands encountered for key are simply + // stored in merge_context.operands_list and never actually merged to get a + // final value. The raw Merge Operands are eventually returned to the user. bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + bool* is_blob_index = nullptr, bool do_merge = true); bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr) { + bool* is_blob_index = nullptr, bool do_merge = true) { SequenceNumber seq; return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index); + read_opts, callback, is_blob_index, do_merge); } // Attempts to update the new_value inplace, else does normal Add diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 0f796eb9a73..d06a82df8ef 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -109,6 +109,20 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, is_blob_index); } +bool MemTableListVersion::GetMergeOperands( + const LookupKey& key, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { + for (MemTable* memtable : memlist_) { + bool done = memtable->Get(key, nullptr, s, merge_context, + max_covering_tombstone_seq, read_opts, nullptr, + nullptr, false); + if (done) { + return true; + } + } + return false; +} + bool MemTableListVersion::GetFromHistory( const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, diff --git a/db/memtable_list.h b/db/memtable_list.h index a72077ff3d5..2bd225b8390 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -71,6 +71,13 @@ class MemTableListVersion { read_opts, callback, is_blob_index); } + // Returns all the merge operands corresponding to the key by searching all + // memtables starting from the most recent one. + bool GetMergeOperands(const LookupKey& key, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts); + // Similar to Get(), but searches the Memtable history of memtables that // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain diff --git a/db/version_set.cc b/db/version_set.cc index 3a1f47790c5..af0168f7660 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1651,7 +1651,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, - bool* is_blob) { + bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -1671,8 +1671,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - value, value_found, merge_context, max_covering_tombstone_seq, this->env_, - seq, merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + do_merge ? value : nullptr, value_found, merge_context, do_merge, + max_covering_tombstone_seq, this->env_, seq, + merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, tracing_get_id); // Pin blocks that we read to hold merge operands @@ -1737,7 +1738,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } - PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, + fp.GetHitFileLevel()); return; case GetContext::kDeleted: // Use empty error message for speed @@ -1755,11 +1757,14 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } f = fp.GetNextFile(); } - if (db_statistics_ != nullptr) { get_context.ReportCounters(); } if (GetContext::kMerge == get_context.State()) { + if (!do_merge) { + *status = Status::OK(); + return; + } if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -1806,7 +1811,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, nullptr, &(iter->merge_context), + iter->value, nullptr, &(iter->merge_context), true, &iter->max_covering_tombstone_seq, this->env_, &iter->seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, tracing_mget_id); diff --git a/db/version_set.h b/db/version_set.h index 391bb902c4b..25598630e2a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -63,7 +63,6 @@ class VersionSet; class WriteBufferManager; class MergeContext; class ColumnFamilySet; -class TableCache; class MergeIteratorBuilder; // Return the smallest index i such that file_level.files[i]->largest >= key. @@ -561,28 +560,33 @@ class Version { const Slice& largest_user_key, int level, bool* overlap); - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. - // Uses *operands to store merge_operator operations to apply later. + // Lookup the value for key or get all merge operands for key. + // If do_merge = true (default) then lookup value for key. + // Behavior if do_merge = true: + // If found, store it in *value and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later. // - // If the ReadOptions.read_tier is set to do a read-only fetch, then - // *value_found will be set to false if it cannot be determined whether - // this value exists without doing IO. + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. // - // If the key is Deleted, *status will be set to NotFound and + // If the key is Deleted, *status will be set to NotFound and // *key_exists will be set to true. - // If no key was found, *status will be set to NotFound and + // If no key was found, *status will be set to NotFound and // *key_exists will be set to false. - // If seq is non-null, *seq will be set to the sequence number found - // for the key if a key was found. - // + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. 
+ // Behavior if do_merge = false + // If the key has any merge operands then store them in + // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, - bool* is_blob = nullptr); + bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, ReadCallback* callback = nullptr, bool* is_blob = nullptr); diff --git a/file/filename.cc b/file/filename.cc index 65ec3314995..ba5d84c291f 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -60,8 +60,7 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { static std::string MakeFileName(uint64_t number, const char* suffix) { char buf[100]; snprintf(buf, sizeof(buf), "%06llu.%s", - static_cast<unsigned long long>(number), - suffix); + static_cast<unsigned long long>(number), suffix); return buf; } diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 1d90dc50b4b..36d6fea92bb 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -116,6 +116,10 @@ struct IngestExternalFileArg { IngestExternalFileOptions options; }; +struct GetMergeOperandsOptions { + int expected_max_number_of_operands = 0; +}; + // A collection of table properties objects, where // key: is the table's file name. // value: the table properties object of the given table. @@ -403,6 +407,22 @@ class DB { return Get(options, DefaultColumnFamily(), key, value); } + // Returns all the merge operands corresponding to the key. If the + // number of merge operands in DB is greater than + // merge_operands_options.expected_max_number_of_operands + // no merge operands are returned and status is Incomplete. Merge operands + // returned are in the order of insertion. + // merge_operands - Points to an array of at least + // merge_operands_options.expected_max_number_of_operands entries; + // the caller is responsible for allocating it. If the status + // returned is Incomplete then number_of_operands will contain + // the total number of merge operands found in DB for key. + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) = 0; + // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often "").
Otherwise, diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index ac97ce442af..e4360126dbd 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -76,6 +76,7 @@ class Status { kMemoryLimit = 7, kSpaceLimit = 8, kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, kMaxSubCode }; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 67bf4e2fa6b..35fddc804b9 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -88,6 +88,17 @@ class StackableDB : public DB { return db_->Get(options, column_family, key, value); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + return db_->GetMergeOperands(options, column_family, key, slice, + get_merge_operands_options, + number_of_operands); + } + using DB::MultiGet; virtual std::vector<Status> MultiGet( const ReadOptions& options, diff --git a/src.mk b/src.mk index 0c6142e41ad..6d1d655c7f0 100644 --- a/src.mk +++ b/src.mk @@ -191,6 +191,7 @@ LIB_SOURCES = \ utilities/memory/memory_util.cc \ utilities/merge_operators/max.cc \ utilities/merge_operators/put.cc \ + utilities/merge_operators/sortlist.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ @@ -291,6 +292,7 @@ MAIN_SOURCES = \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ + db/db_merge_operand_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 484617d7e14..ae23f6ef2d3 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -631,7 +631,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -656,7 +656,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -681,7 +681,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 120, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -706,7 +706,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 5, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); +
nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kNotFound); diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index dd1557db147..8043d36ab8e 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -122,7 +122,7 @@ class CuckooReaderTest : public testing::Test { PinnableSlice value; GetContext get_context(ucomp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(user_keys[i]), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); ASSERT_STREQ(values[i].c_str(), value.data()); @@ -336,7 +336,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { AppendInternalKey(¬_found_key, ikey); PinnableSlice value; GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, - Slice(not_found_key), &value, nullptr, nullptr, + Slice(not_found_key), &value, nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr)); @@ -351,7 +351,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context2(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(not_found_key2), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr)); ASSERT_TRUE(value.empty()); @@ -367,7 +367,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context3(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); ASSERT_TRUE(value.empty()); @@ -443,7 +443,7 @@ void WriteFile(const std::vector& keys, // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { value.Reset(); value.clear(); @@ -491,7 +491,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { diff --git a/table/get_context.cc b/table/get_context.cc index f0c7928bf42..cdb5798f782 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -42,9 +42,9 @@ GetContext::GetContext( const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, - bool* is_blob_index, uint64_t tracing_get_id) + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, + SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) : ucmp_(ucmp), 
merge_operator_(merge_operator), logger_(logger), @@ -60,6 +60,7 @@ GetContext::GetContext( replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), + do_merge_(do_merge), is_blob_index_(is_blob_index), tracing_get_id_(tracing_get_id) { if (seq_) { @@ -215,29 +216,44 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } if (kNotFound == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (LIKELY(value_pinner != nullptr)) { - // If the backing resources for the value are provided, pin them - pinnable_val_->PinSlice(value, value_pinner); - } else { - TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", + this); - // Otherwise copy the value - pinnable_val_->PinSelf(value); + // Otherwise copy the value + pinnable_val_->PinSelf(value); + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } if (is_blob_index_ != nullptr) { @@ -256,14 +272,18 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else if (kMerge == state_) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } } return false; @@ -272,24 +292,23 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(state_ == kNotFound || state_ == kMerge); state_ = kMerge; // value_pinner is not set from plain_table_reader.cc for example. 
- if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && - value_pinner != nullptr) { - value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); - merge_context_->PushOperand(value, true /*value_pinned*/); - } else { - merge_context_->PushOperand(value, false); - } - if (merge_operator_ != nullptr && - merge_operator_->ShouldMerge(merge_context_->GetOperandsDirectionBackward())) { + push_operand(value, value_pinner); + if (do_merge_ && merge_operator_ != nullptr && + merge_operator_->ShouldMerge( + merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + // If do_merge_ is true, this function is called as part of the DB Get + // API, hence the merge operands should be merged. + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } } return false; @@ -306,6 +325,16 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } +void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { + if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && + value_pinner != nullptr) { + value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); + merge_context_->PushOperand(value, true /*value_pinned*/); + } else { + merge_context_->PushOperand(value, false); + } +} + void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner) { #ifndef ROCKSDB_LITE diff --git a/table/get_context.h b/table/get_context.h index 7110ceae806..97d73ec0b3a 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -66,6 +66,9 @@ class GetContext { GetContextStats get_context_stats_; // Constructor + // @param value Holds the value corresponding to user_key. If it's nullptr + // then return all merge operands corresponding to user_key + // via merge_context // @param value_found If non-nullptr, set to false if key may be present // but we can't be certain because we cannot do IO // @param max_covering_tombstone_seq Pointer to highest sequence number of @@ -78,10 +81,14 @@ class GetContext { // for visibility of a key // @param is_blob_index If non-nullptr, will be used to indicate if a found // key is of type blob index + // @param do_merge True if the value associated with user_key has to be + // returned and false if all the merge operands associated with user_key + // have to be returned. If do_merge=false then all the merge operands are + // stored in merge_context and they are never merged. The value pointer is + // untouched.
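+ // A minimal illustration (hypothetical key and operands, for exposition
+ // only): after Merge(key, "v1") and Merge(key, "v2"), a lookup with
+ // do_merge=true stores the merged result in *value, while a lookup with
+ // do_merge=false leaves *value untouched and merge_context accumulates
+ // the raw operands {"v1", "v2"}.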
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, - MergeContext* merge_context, + MergeContext* merge_context, bool do_merge, SequenceNumber* max_covering_tombstone_seq, Env* env, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, @@ -140,6 +147,8 @@ class GetContext { uint64_t get_tracing_get_id() const { return tracing_get_id_; } + void push_operand(const Slice& value, Cleanable* value_pinner); + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -162,6 +171,10 @@ class GetContext { PinnedIteratorsManager* pinned_iters_mgr_; ReadCallback* callback_; bool sample_; + // Value is true if it's called as part of DB Get API and false if it's + // called as part of DB GetMergeOperands API. When it's false merge operators + // are never merged. + bool do_merge_; bool* is_blob_index_; // Used for block cache tracing only. A tracing get id uniquely identifies a // Get or a MultiGet. diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index cec62df5949..45d760f0ef8 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -175,7 +175,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, ioptions.merge_operator, ioptions.info_log, ioptions.statistics, GetContext::kNotFound, Slice(key), &value, nullptr, &merge_context, - &max_covering_tombstone_seq, env); + true, &max_covering_tombstone_seq, env); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); diff --git a/table/table_test.cc b/table/table_test.cc index 6cd26bc732a..749048b78c2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2323,8 +2323,8 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, /*get_id=*/i); + nullptr, true, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, /*tracing_get_id=*/i); get_perf_context()->Reset(); ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -2579,7 +2579,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), nullptr, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. 
reader->Get(ReadOptions(), "non-exist-key", &get_context, moptions.prefix_extractor.get()); @@ -2750,7 +2750,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, moptions4.prefix_extractor.get())); ASSERT_STREQ(value.data(), "hello"); @@ -2836,7 +2836,7 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); get_perf_context()->Reset(); ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -2862,7 +2862,7 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); get_perf_context()->Reset(); ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -4230,7 +4230,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { std::string user_key = ExtractUserKey(kv.first).ToString(); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, kv.first, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -4256,7 +4256,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, encoded_key, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kNotFound); diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index f6a9d945897..001dd4d2fb0 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -71,6 +71,7 @@ #include "utilities/blob_db/blob_db.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/sortlist.h" #include "utilities/persistent_cache/block_cache_tier.h" #ifdef OS_WIN @@ -120,7 +121,8 @@ DEFINE_string( "fillseekseq," "randomtransaction," "randomreplacekeys," - "timeseries", + "timeseries," + "getmergeoperands", "Comma-separated list of operations to run in the specified" " order. Available benchmarks:\n" @@ -190,7 +192,13 @@ DEFINE_string( "\tlevelstats -- Print the number of files and bytes per level\n" "\tsstables -- Print sstable info\n" "\theapprofile -- Dump a heap profile (if supported by this port)\n" - "\treplay -- replay the trace file specified with trace_file\n"); + "\treplay -- replay the trace file specified with trace_file\n" + "\tgetmergeoperands -- Insert lots of merge records which are a list of " + "sorted ints for a key and then compare performance of lookup for another " + "key " + "by doing a Get followed by binary searching in the large sorted list vs " + "doing a GetMergeOperands and binary searching in the operands which are " + "sorted sub-lists.
The MergeOperator used is sortlist.h\n"); DEFINE_int64(num, 1000000, "Number of key/values to place in database"); @@ -2880,6 +2888,8 @@ class Benchmark { exit(1); } method = &Benchmark::Replay; + } else if (name == "getmergeoperands") { + method = &Benchmark::GetMergeOperands; } else if (!name.empty()) { // No error message for empty name fprintf(stderr, "unknown benchmark '%s'\n", name.c_str()); exit(1); @@ -5921,6 +5931,97 @@ class Benchmark { } } + bool binary_search(std::vector<int>& data, int start, int end, int key) { + if (data.empty()) return false; + if (start > end) return false; + int mid = start + (end - start) / 2; + if (mid > static_cast<int>(data.size()) - 1) return false; + if (data[mid] == key) { + return true; + } else if (data[mid] > key) { + return binary_search(data, start, mid - 1, key); + } else { + return binary_search(data, mid + 1, end, key); + } + } + + // Does a bunch of merge operations for a key (key1) where each merge operand + // is a sorted list. Next, a performance comparison is done between doing a + // Get for key1 followed by searching for another key (key2) in the large + // sorted list vs calling GetMergeOperands for key1 and then searching for + // key2 in all the sorted sub-lists. The latter case is expected to be a lot + // faster. + void GetMergeOperands(ThreadState* thread) { + DB* db = SelectDB(thread); + const int kTotalValues = 100000; + const int kListSize = 100; + std::string key = "my_key"; + std::string value; + + for (int i = 1; i < kTotalValues; i++) { + if (i % kListSize == 0) { + // Remove trailing ',' + value.pop_back(); + db->Merge(WriteOptions(), key, value); + value.clear(); + } else { + value.append(std::to_string(i)).append(","); + } + } + + SortList s; + std::vector<int> data; + // This value can be experimented with and it will demonstrate the + // perf difference between doing a Get and searching for lookup_key in the + // resultant large sorted list vs doing GetMergeOperands and searching + // for lookup_key within the resultant sorted sub-lists. + int lookup_key = 1; + + // Get API call + std::cout << "--- Get API call --- \n"; + PinnableSlice p_slice; + uint64_t st = FLAGS_env->NowNanos(); + db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice); + s.MakeVector(data, p_slice); + bool found = + binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key); + std::cout << "Found key? " << std::to_string(found) << "\n"; + uint64_t sp = FLAGS_env->NowNanos(); + std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n"; + std::string* dat_ = p_slice.GetSelf(); + std::cout << "Sample data from Get API call: " << dat_->substr(0, 10) + << "\n"; + data.clear(); + + // GetMergeOperands API call + std::cout << "--- GetMergeOperands API --- \n"; + std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1); + st = FLAGS_env->NowNanos(); + int number_of_operands = 0; + GetMergeOperandsOptions get_merge_operands_options; + get_merge_operands_options.expected_max_number_of_operands = + (kTotalValues / 100) + 1; + db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key, + a_slice.data(), &get_merge_operands_options, + &number_of_operands); + for (PinnableSlice& psl : a_slice) { + s.MakeVector(data, psl); + found = + binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key); + data.clear(); + if (found) break; + } + std::cout << "Found key?
" << std::to_string(found) << "\n"; + sp = FLAGS_env->NowNanos(); + std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0 + << " seconds \n"; + int to_print = 0; + std::cout << "Sample data from GetMergeOperands API call: "; + for (PinnableSlice& psl : a_slice) { + std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n"; + if (to_print++ > 2) break; + } + } + #ifndef ROCKSDB_LITE // This benchmark stress tests Transactions. For a given --duration (or // total number of --writes, a Transaction will perform a read-modify-write diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index caa9b098804..86501280d22 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1146,9 +1146,11 @@ Status BlobDBImpl::GetImpl(const ReadOptions& read_options, PinnableSlice index_entry; Status s; bool is_blob_index = false; - s = db_impl_->GetImpl(ro, column_family, key, &index_entry, - nullptr /*value_found*/, nullptr /*read_callback*/, - &is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &index_entry; + get_impl_options.is_blob_index = &is_blob_index; + s = db_impl_->GetImpl(ro, key, get_impl_options); TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1"); TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2"); if (expiration != nullptr) { @@ -1535,9 +1537,12 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, SequenceNumber latest_seq = GetLatestSequenceNumber(); bool is_blob_index = false; PinnableSlice index_entry; - Status get_status = db_impl_->GetImpl( - ReadOptions(), cfh, record.key, &index_entry, nullptr /*value_found*/, - nullptr /*read_callback*/, &is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh; + get_impl_options.value = &index_entry; + get_impl_options.is_blob_index = &is_blob_index; + Status get_status = + db_impl_->GetImpl(ReadOptions(), record.key, get_impl_options); TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB"); if (!get_status.ok() && !get_status.IsNotFound()) { // error diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h index 4c720b822fe..6e3464bdfb4 100644 --- a/utilities/merge_operators.h +++ b/utilities/merge_operators.h @@ -23,6 +23,7 @@ class MergeOperators { static std::shared_ptr CreateStringAppendTESTOperator(); static std::shared_ptr CreateMaxOperator(); static std::shared_ptr CreateBytesXOROperator(); + static std::shared_ptr CreateSortOperator(); // Will return a different merge operator depending on the string. // TODO: Hook the "name" up to the actual Name() of the MergeOperators? @@ -42,6 +43,8 @@ class MergeOperators { return CreateMaxOperator(); } else if (name == "bytesxor") { return CreateBytesXOROperator(); + } else if (name == "sortlist") { + return CreateSortOperator(); } else { // Empty or unknown, just return nullptr return nullptr; diff --git a/utilities/merge_operators/sortlist.cc b/utilities/merge_operators/sortlist.cc new file mode 100644 index 00000000000..5dbf051157e --- /dev/null +++ b/utilities/merge_operators/sortlist.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/sortlist.h" + +using rocksdb::Logger; +using rocksdb::MergeOperator; +using rocksdb::Slice; + +namespace rocksdb { + +bool SortList::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + std::vector left; + for (Slice slice : merge_in.operand_list) { + std::vector right; + MakeVector(right, slice); + left = Merge(left, right); + } + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + merge_out->new_value.append(std::to_string(left[i])).append(","); + } + merge_out->new_value.append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const { + std::vector left; + std::vector right; + MakeVector(left, left_operand); + MakeVector(right, right_operand); + left = Merge(left, right); + for (int i = 0; i < static_cast(left.size()) - 1; i++) { + new_value->append(std::to_string(left[i])).append(","); + } + new_value->append(std::to_string(left.back())); + return true; +} + +bool SortList::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + (void)operand_list; + (void)new_value; + return true; +} + +const char* SortList::Name() const { return "MergeSortOperator"; } + +void SortList::MakeVector(std::vector& operand, Slice slice) const { + do { + const char* begin = slice.data_; + while (*slice.data_ != ',' && *slice.data_) slice.data_++; + operand.push_back(std::stoi(std::string(begin, slice.data_))); + } while (0 != *slice.data_++); +} + +std::vector SortList::Merge(std::vector& left, + std::vector& right) const { + // Fill the resultant vector with sorted results from both vectors + std::vector result; + unsigned left_it = 0, right_it = 0; + + while (left_it < left.size() && right_it < right.size()) { + // If the left value is smaller than the right it goes next + // into the resultant vector + if (left[left_it] < right[right_it]) { + result.push_back(left[left_it]); + left_it++; + } else { + result.push_back(right[right_it]); + right_it++; + } + } + + // Push the remaining data from both vectors onto the resultant + while (left_it < left.size()) { + result.push_back(left[left_it]); + left_it++; + } + + while (right_it < right.size()) { + result.push_back(right[right_it]); + right_it++; + } + + return result; +} + +std::shared_ptr MergeOperators::CreateSortOperator() { + return std::make_shared(); +} +} // namespace rocksdb diff --git a/utilities/merge_operators/sortlist.h b/utilities/merge_operators/sortlist.h new file mode 100644 index 00000000000..02c93edf5e9 --- /dev/null +++ b/utilities/merge_operators/sortlist.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// A MergeOperator for RocksDB that implements Merge Sort. +// It is built using the MergeOperator interface. The operator works by taking +// an input which contains one or more merge operands where each operand is a +// list of sorted ints and merges them to form a large sorted list. 
+#pragma once + +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { + +class SortList : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + + bool PartialMergeMulti(const Slice& key, + const std::deque<Slice>& operand_list, + std::string* new_value, Logger* logger) const override; + + const char* Name() const override; + + void MakeVector(std::vector<int>& operand, Slice slice) const; + + private: + std::vector<int> Merge(std::vector<int>& left, std::vector<int>& right) const; +}; + +} // namespace rocksdb diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 188f61120be..8dfc0d1d4ac 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -290,8 +290,12 @@ Status WritePreparedTxn::RollbackInternal() { PinnableSlice pinnable_val; bool not_used; auto cf_handle = handles_[cf]; - s = db_->GetImpl(roptions_, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_->GetImpl(roptions_, key, get_impl_options); assert(s.ok() || s.IsNotFound()); if (s.ok()) { s = rollback_batch_->Put(cf_handle, key, pinnable_val); diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index e6d71020685..7441cb3c093 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -231,8 +231,12 @@ Status WritePreparedTxnDB::Get(const ReadOptions& options, WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted, backed_by_snapshot); bool* dont_care = nullptr; - auto res = db_impl_->GetImpl(options, column_family, key, value, dont_care, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + get_impl_options.value_found = dont_care; + get_impl_options.callback = &callback; + auto res = db_impl_->GetImpl(options, key, get_impl_options); if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(), backed_by_snapshot))) { return res; diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index a1862d32d44..321110ea1b6 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -567,8 +567,12 @@ Status WriteUnpreparedTxn::RollbackInternal() { const auto& cf_handle = cf_map.at(cfid); PinnableSlice pinnable_val; bool not_used; - s = db_impl_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_impl_->GetImpl(roptions, key, get_impl_options); if (s.ok()) { s = rollback_batch.Put(cf_handle, key, pinnable_val); @@ -721,8 +725,12 @@ Status WriteUnpreparedTxn::RollbackToSavePointInternal() { const auto& cf_handle = cf_map.at(cfid); PinnableSlice pinnable_val; bool not_used; - s =
db_impl_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_impl_->GetImpl(roptions, key, get_impl_options); if (s.ok()) { s = write_batch_.Put(cf_handle, key, pinnable_val); diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index defaf9fce6e..3a8eff5ec5e 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -86,8 +86,12 @@ Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction( PinnableSlice pinnable_val; bool not_used; auto cf_handle = handles_[cf]; - s = db_->GetImpl(roptions, cf_handle, key, &pinnable_val, &not_used, - &callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cf_handle; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &not_used; + get_impl_options.callback = &callback; + s = db_->GetImpl(roptions, key, get_impl_options); assert(s.ok() || s.IsNotFound()); if (s.ok()) { s = rollback_batch_->Put(cf_handle, key, pinnable_val); diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 3ffa2e0c62a..272a2ab4862 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -891,9 +891,12 @@ Status WriteBatchWithIndex::GetFromBatchAndDB( if (!callback) { s = db->Get(read_options, column_family, key, pinnable_val); } else { + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = pinnable_val; + get_impl_options.callback = callback; s = static_cast_with_check<DBImpl, DB>(db->GetRootDB()) - ->GetImpl(read_options, column_family, key, pinnable_val, nullptr, - callback); + ->GetImpl(read_options, key, get_impl_options); } if (s.ok() || s.IsNotFound()) { // DB Get Succeeded From 6e78fe3c8d35fa1c0836af4501e0f272bc363bab Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Tue, 6 Aug 2019 18:47:39 -0700 Subject: [PATCH 284/572] Pysim more algorithms (#5644) Summary: This PR adds four more eviction policies. - OPT [1] - Hyperbolic caching [2] - ARC [3] - GreedyDualSize [4] [1] L. A. Belady. 1966. A Study of Replacement Algorithms for a Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101. DOI=http://dx.doi.org/10.1147/sj.52.0078 [2] Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017. Hyperbolic caching: flexible caching for web applications. In Proceedings of the 2017 USENIX Conference on Usenix Annual Technical Conference (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511. [3] Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on File and Storage Technologies (FAST '03). USENIX Association, Berkeley, CA, USA, 115-130. [4] N. Young. The k-server dual and loose competitiveness for paging. Algorithmica, June 1994, vol. 11,(no.6):525-41. Rewritten version of ''On-line caching as cache size varies'', in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5644 Differential Revision: D16548817 Pulled By: HaoyuHuang fbshipit-source-id: 838f76db9179f07911abaab46c97e1c929cfcd63 --- .../block_cache_analyzer/block_cache_pysim.py | 1636 ++++++++++++++--- .../block_cache_analyzer/block_cache_pysim.sh | 86 +- .../block_cache_pysim_test.py | 478 ++++- .../block_cache_trace_analyzer.cc | 162 +- .../block_cache_trace_analyzer.h | 14 +- .../block_cache_trace_analyzer_test.cc | 31 +- trace_replay/block_cache_tracer.cc | 39 +- trace_replay/block_cache_tracer.h | 9 + .../simulator_cache/cache_simulator_test.cc | 26 +- 9 files changed, 2111 insertions(+), 370 deletions(-) diff --git a/tools/block_cache_analyzer/block_cache_pysim.py b/tools/block_cache_analyzer/block_cache_pysim.py index 63e367be5a7..67307df5329 100644 --- a/tools/block_cache_analyzer/block_cache_pysim.py +++ b/tools/block_cache_analyzer/block_cache_pysim.py @@ -2,15 +2,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import gc +import heapq import random import sys import time +from collections import OrderedDict from os import path import numpy as np -kSampleSize = 16 # The sample size used when performing eviction. +kSampleSize = 64 # The sample size used when performing eviction. kMicrosInSecond = 1000000 kSecondsInMinute = 60 kSecondsInHour = 3600 @@ -39,11 +41,19 @@ def __init__( key_id, kv_size, is_hit, + referenced_key_exist_in_block, + num_keys_in_block, + table_id, + seq_number, + block_key_size, + key_size, + block_offset_in_file, + next_access_seq_no, ): self.access_time = access_time self.block_id = block_id self.block_type = block_type - self.block_size = block_size + self.block_size = block_size + block_key_size self.cf_id = cf_id self.cf_name = cf_name self.level = level @@ -60,22 +70,46 @@ def __init__( self.is_hit = True else: self.is_hit = False + if referenced_key_exist_in_block == 1: + self.referenced_key_exist_in_block = True + else: + self.referenced_key_exist_in_block = False + self.num_keys_in_block = num_keys_in_block + self.table_id = table_id + self.seq_number = seq_number + self.block_key_size = block_key_size + self.key_size = key_size + self.block_offset_in_file = block_offset_in_file + self.next_access_seq_no = next_access_seq_no class CacheEntry: """A cache entry stored in the cache.""" - def __init__(self, value_size, cf_id, level, block_type, access_number): + def __init__( + self, + value_size, + cf_id, + level, + block_type, + table_id, + access_number, + time_s, + num_hits=0, + ): self.value_size = value_size self.last_access_number = access_number - self.num_hits = 0 + self.num_hits = num_hits self.cf_id = 0 self.level = level self.block_type = block_type + self.last_access_time = time_s + self.insertion_time = time_s + self.table_id = table_id def __repr__(self): """Debug string.""" - return "s={},last={},hits={},cf={},l={},bt={}".format( + return "(s={},last={},hits={},cf={},l={},bt={})\n".format( self.value_size, self.last_access_number, self.num_hits, @@ -84,6 +118,22 @@ def __repr__(self): self.block_type, ) + def cost_class(self, cost_class_label): + if cost_class_label == "table_bt": + return "{}-{}".format(self.table_id, self.block_type) + elif cost_class_label == "table": + return "{}".format(self.table_id) + elif cost_class_label == "bt": + return "{}".format(self.block_type) + elif cost_class_label == "cf": + return "{}".format(self.cf_id) + elif cost_class_label == "cf_bt": + return "{}-{}".format(self.cf_id, self.block_type) + elif 
cost_class_label == "table_level_bt": + return "{}-{}-{}".format(self.table_id, self.level, self.block_type) + assert False, "Unknown cost class label {}".format(cost_class_label) + return None + class HashEntry: """A hash entry stored in a hash table.""" @@ -106,30 +156,55 @@ class HashTable: """ def __init__(self): - self.table = [None] * 32 + self.initial_size = 32 + self.table = [None] * self.initial_size self.elements = 0 def random_sample(self, sample_size): """Randomly sample 'sample_size' hash entries from the table.""" samples = [] - index = random.randint(0, len(self.table)) - pos = (index + 1) % len(self.table) - searches = 0 + index = random.randint(0, len(self.table) - 1) + pos = index # Starting from index, adding hash entries to the sample list until # sample_size is met or we ran out of entries. - while pos != index and len(samples) < sample_size: + while True: if self.table[pos] is not None: for i in range(len(self.table[pos])): if self.table[pos][i] is None: continue samples.append(self.table[pos][i]) - if len(samples) > sample_size: + if len(samples) == sample_size: break pos += 1 pos = pos % len(self.table) - searches += 1 + if pos == index or len(samples) == sample_size: + break + assert len(samples) <= sample_size return samples + def __repr__(self): + all_entries = [] + for i in range(len(self.table)): + if self.table[i] is None: + continue + for j in range(len(self.table[i])): + if self.table[i][j] is not None: + all_entries.append(self.table[i][j]) + return "{}".format(all_entries) + + def values(self): + all_values = [] + for i in range(len(self.table)): + if self.table[i] is None: + continue + for j in range(len(self.table[i])): + if self.table[i][j] is not None: + all_values.append(self.table[i][j].value) + return all_values + + def __len__(self): + return self.elements + def insert(self, key, hash, value): """ Insert a hash entry in the table. Replace the old entry if it already @@ -140,19 +215,21 @@ def insert(self, key, hash, value): index = hash % len(self.table) if self.table[index] is None: self.table[index] = [] + # Search for the entry first. for i in range(len(self.table[index])): - if self.table[index][i] is not None: - if ( - self.table[index][i].hash == hash - and self.table[index][i].key == key - ): - # The entry already exists in the table. - self.table[index][i] = HashEntry(key, hash, value) - return + if self.table[index][i] is None: continue - self.table[index][i] = HashEntry(key, hash, value) - inserted = True - break + if self.table[index][i].hash == hash and self.table[index][i].key == key: + # The entry already exists in the table. + self.table[index][i] = HashEntry(key, hash, value) + return + + # Find an empty slot. 
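+        # delete() leaves None holes in a bucket; reuse one before appending.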
+ for i in range(len(self.table[index])): + if self.table[index][i] is None: + self.table[index][i] = HashEntry(key, hash, value) + inserted = True + break if not inserted: self.table[index].append(HashEntry(key, hash, value)) self.elements += 1 @@ -160,7 +237,7 @@ def insert(self, key, hash, value): def resize(self, new_size): if new_size == len(self.table): return - if new_size == 0: + if new_size < self.initial_size: return if self.elements < 100: return @@ -184,29 +261,31 @@ def resize(self, new_size): gc.collect() def grow(self): - if self.elements < len(self.table): + if self.elements < 4 * len(self.table): return - new_size = int(len(self.table) * 1.2) + new_size = int(len(self.table) * 1.5) self.resize(new_size) def delete(self, key, hash): index = hash % len(self.table) - entries = self.table[index] deleted = False - if entries is None: + deleted_entry = None + if self.table[index] is None: return - for i in range(len(entries)): + for i in range(len(self.table[index])): if ( - entries[i] is not None - and entries[i].hash == hash - and entries[i].key == key + self.table[index][i] is not None + and self.table[index][i].hash == hash + and self.table[index][i].key == key ): - entries[i] = None + deleted_entry = self.table[index][i] + self.table[index][i] = None self.elements -= 1 deleted = True break if deleted: self.shrink() + return deleted_entry def shrink(self): if self.elements * 2 >= len(self.table): @@ -216,12 +295,15 @@ def shrink(self): def lookup(self, key, hash): index = hash % len(self.table) - entries = self.table[index] - if entries is None: + if self.table[index] is None: return None - for entry in entries: - if entry is not None and entry.hash == hash and entry.key == key: - return entry.value + for i in range(len(self.table[index])): + if ( + self.table[index][i] is not None + and self.table[index][i].hash == hash + and self.table[index][i].key == key + ): + return self.table[index][i].value return None @@ -231,9 +313,10 @@ def __init__(self, time_unit): self.num_accesses = 0 self.time_unit = time_unit self.time_misses = {} + self.time_miss_bytes = {} self.time_accesses = {} - def update_metrics(self, access_time, is_hit): + def update_metrics(self, access_time, is_hit, miss_bytes): access_time /= kMicrosInSecond * self.time_unit self.num_accesses += 1 if access_time not in self.time_accesses: @@ -243,20 +326,41 @@ def update_metrics(self, access_time, is_hit): self.num_misses += 1 if access_time not in self.time_misses: self.time_misses[access_time] = 0 + self.time_miss_bytes[access_time] = 0 self.time_misses[access_time] += 1 + self.time_miss_bytes[access_time] += miss_bytes def reset_counter(self): self.num_misses = 0 self.num_accesses = 0 + self.time_miss_bytes.clear() + self.time_misses.clear() + self.time_accesses.clear() + + def compute_miss_bytes(self): + miss_bytes = [] + for at in self.time_miss_bytes: + miss_bytes.append(self.time_miss_bytes[at]) + miss_bytes = sorted(miss_bytes) + avg_miss_bytes = 0 + p95_miss_bytes = 0 + for i in range(len(miss_bytes)): + avg_miss_bytes += float(miss_bytes[i]) / float(len(miss_bytes)) + + p95_index = min(int(0.95 * float(len(miss_bytes))), len(miss_bytes) - 1) + p95_miss_bytes = miss_bytes[p95_index] + return avg_miss_bytes, p95_miss_bytes def miss_ratio(self): return float(self.num_misses) * 100.0 / float(self.num_accesses) - def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end): + def write_miss_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): start /= 
kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -264,8 +368,8 @@ def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end): for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-miss-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: row = "{}".format(cache_type) @@ -273,11 +377,13 @@ def write_miss_timeline(self, cache_type, cache_size, result_dir, start, end): row += ",{}".format(self.time_misses.get(trace_time, 0)) file.write(row + "\n") - def write_miss_ratio_timeline(self, cache_type, cache_size, result_dir, start, end): + def write_miss_ratio_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): start /= kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -285,8 +391,8 @@ def write_miss_ratio_timeline(self, cache_type, cache_size, result_dir, start, e for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: row = "{}".format(cache_type) @@ -322,11 +428,13 @@ def update_metrics(self, access_time, selected_policy): self.time_selected_polices[access_time][policy_name] = 0 self.time_selected_polices[access_time][policy_name] += 1 - def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end): + def write_policy_timeline( + self, cache_type, cache_size, target_cf_name, result_dir, start, end + ): start /= kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -334,8 +442,8 @@ def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end): for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-policy-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-policy-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) 
with open(file_path, "w+") as file: for policy in self.policy_names: @@ -350,12 +458,12 @@ def write_policy_timeline(self, cache_type, cache_size, result_dir, start, end): file.write(row + "\n") def write_policy_ratio_timeline( - self, cache_type, cache_size, file_path, start, end + self, cache_type, cache_size, target_cf_name, file_path, start, end ): start /= kMicrosInSecond * self.time_unit end /= kMicrosInSecond * self.time_unit - header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) if not path.exists(header_file_path): with open(header_file_path, "w+") as header_file: @@ -363,8 +471,8 @@ def write_policy_ratio_timeline( for trace_time in range(start, end): header += ",{}".format(trace_time) header_file.write(header + "\n") - file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}".format( - result_dir, self.time_unit, cache_type, cache_size + file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format( + result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: for policy in self.policy_names: @@ -400,7 +508,7 @@ def evict(self, key, max_size): def delete(self, key): self.evicted_keys.pop(key, None) - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): raise NotImplementedError def policy_name(self): @@ -413,7 +521,7 @@ def generate_reward(self, key): class LRUPolicy(Policy): - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): return sorted( samples, cmp=lambda e1, e2: e1.value.last_access_number @@ -425,7 +533,7 @@ def policy_name(self): class MRUPolicy(Policy): - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): return sorted( samples, cmp=lambda e1, e2: e2.value.last_access_number @@ -437,175 +545,478 @@ def policy_name(self): class LFUPolicy(Policy): - def prioritize_samples(self, samples): + def prioritize_samples(self, samples, auxilliary_info): return sorted(samples, cmp=lambda e1, e2: e1.value.num_hits - e2.value.num_hits) def policy_name(self): return "lfu" -class MLCache(object): - def __init__(self, cache_size, enable_cache_row_key, policies): +class HyperbolicPolicy(Policy): + """ + An implementation of Hyperbolic caching. + + Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017. + Hyperbolic caching: flexible caching for web applications. In Proceedings + of the 2017 USENIX Conference on Usenix Annual Technical Conference + (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511. 
+ """ + + def compare(self, e1, e2, now): + e1_duration = max(0, (now - e1.value.insertion_time) / kMicrosInSecond) * float( + e1.value.value_size + ) + e2_duration = max(0, (now - e2.value.insertion_time) / kMicrosInSecond) * float( + e2.value.value_size + ) + if e1_duration == e2_duration: + return e1.value.num_hits - e2.value.num_hits + if e1_duration == 0: + return 1 + if e2_duration == 0: + return 1 + diff = (float(e1.value.num_hits) / (float(e1_duration))) - ( + float(e2.value.num_hits) / float(e2_duration) + ) + if diff == 0: + return 0 + elif diff > 0: + return 1 + else: + return -1 + + def prioritize_samples(self, samples, auxilliary_info): + assert len(auxilliary_info) == 3 + now = auxilliary_info[0] + return sorted(samples, cmp=lambda e1, e2: self.compare(e1, e2, now)) + + def policy_name(self): + return "hb" + + +class CostClassPolicy(Policy): + """ + We calculate the hit density of a cost class as + number of hits / total size in cache * average duration in the cache. + + An entry has a higher priority if its class's hit density is higher. + """ + + def compare(self, e1, e2, now, cost_classes, cost_class_label): + e1_class = e1.value.cost_class(cost_class_label) + e2_class = e2.value.cost_class(cost_class_label) + + assert e1_class in cost_classes + assert e2_class in cost_classes + + e1_entry = cost_classes[e1_class] + e2_entry = cost_classes[e2_class] + e1_density = e1_entry.density(now) + e2_density = e2_entry.density(now) + e1_hits = cost_classes[e1_class].hits + e2_hits = cost_classes[e2_class].hits + + if e1_density == e2_density: + return e1_hits - e2_hits + + if e1_entry.num_entries_in_cache == 0: + return -1 + if e2_entry.num_entries_in_cache == 0: + return 1 + + if e1_density == 0: + return 1 + if e2_density == 0: + return -1 + diff = (float(e1_hits) / float(e1_density)) - ( + float(e2_hits) / float(e2_density) + ) + if diff == 0: + return 0 + elif diff > 0: + return 1 + else: + return -1 + + def prioritize_samples(self, samples, auxilliary_info): + assert len(auxilliary_info) == 3 + now = auxilliary_info[0] + cost_classes = auxilliary_info[1] + cost_class_label = auxilliary_info[2] + return sorted( + samples, + cmp=lambda e1, e2: self.compare( + e1, e2, now, cost_classes, cost_class_label + ), + ) + + def policy_name(self): + return "cc" + + +class Cache(object): + """ + This is the base class for the implementations of alternative cache + replacement policies. + """ + + def __init__(self, cache_size, enable_cache_row_key): self.cache_size = cache_size self.used_size = 0 + self.per_second_miss_ratio_stats = MissRatioStats(1) self.miss_ratio_stats = MissRatioStats(kSecondsInMinute) - self.policy_stats = PolicyStats(kSecondsInMinute, policies) self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour) - self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies) - self.table = HashTable() + # 0: disabled. 1: enabled. Insert both row and the refereneced data block. + # 2: enabled. Insert only the row but NOT the referenced data block. 
self.enable_cache_row_key = enable_cache_row_key self.get_id_row_key_map = {} - self.policies = policies + self.max_seen_get_id = 0 + self.retain_get_id_range = 100000 - def _lookup(self, key, hash): - value = self.table.lookup(key, hash) - if value is not None: - value.last_access_number = self.miss_ratio_stats.num_accesses - value.num_hits += 1 - return True - return False + def block_key(self, trace_record): + return "b{}".format(trace_record.block_id) - def _select_policy(self, trace_record, key): - raise NotImplementedError + def row_key(self, trace_record): + return "g{}-{}".format(trace_record.fd, trace_record.key_id) - def cache_name(self): + def _lookup(self, trace_record, key, hash): + """ + Look up the key in the cache. + Returns true upon a cache hit, false otherwise. + """ raise NotImplementedError - def _evict(self, policy_index, value_size): - # Randomly sample n entries. - samples = self.table.random_sample(kSampleSize) - samples = self.policies[policy_index].prioritize_samples(samples) - for hash_entry in samples: - self.used_size -= hash_entry.value.value_size - self.table.delete(hash_entry.key, hash_entry.hash) - self.policies[policy_index].evict( - key=hash_entry.key, max_size=self.table.elements - ) - if self.used_size + value_size <= self.cache_size: - break + def _evict(self, trace_record, key, hash, value_size): + """ + Evict entries in the cache until there is enough room to insert the new + entry with 'value_size'. + """ + raise NotImplementedError def _insert(self, trace_record, key, hash, value_size): - if value_size > self.cache_size: - return - policy_index = self._select_policy(trace_record, key) - self.policies[policy_index].delete(key) - self.policy_stats.update_metrics(trace_record.access_time, policy_index) - self.per_hour_policy_stats.update_metrics( - trace_record.access_time, policy_index - ) - while self.used_size + value_size > self.cache_size: - self._evict(policy_index, value_size) - self.table.insert( - key, - hash, - CacheEntry( - value_size, - trace_record.cf_id, - trace_record.level, - trace_record.block_type, - self.miss_ratio_stats.num_accesses, - ), - ) - self.used_size += value_size + """ + Insert the new entry into the cache. + """ + raise NotImplementedError - def _access_kv(self, trace_record, key, hash, value_size, no_insert): - if self._lookup(key, hash): - return True - if not no_insert and value_size > 0: - self._insert(trace_record, key, hash, value_size) + def _should_admit(self, trace_record, key, hash, value_size): + """ + A custom admission policy to decide whether we should admit the new + entry upon a cache miss. + Returns true if the new entry should be admitted, false otherwise. + """ + raise NotImplementedError + + def cache_name(self): + """ + The name of the replacement policy. + """ + raise NotImplementedError + + def is_ml_cache(self): return False - def _update_stats(self, access_time, is_hit): - self.miss_ratio_stats.update_metrics(access_time, is_hit) - self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit) + def _update_stats(self, access_time, is_hit, miss_bytes): + self.per_second_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + self.miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) + self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes) def access(self, trace_record): + """ + Access a trace record. The simulator calls this function to access a + trace record. 
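+        A Get request (caller == 1) with hybrid row caching enabled is
+        dispatched to _access_row; all other records are treated as plain
+        block accesses.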
+ """ assert self.used_size <= self.cache_size if ( - self.enable_cache_row_key + self.enable_cache_row_key > 0 and trace_record.caller == 1 and trace_record.key_id != 0 and trace_record.get_id != 0 ): # This is a get request. - if trace_record.get_id not in self.get_id_row_key_map: - self.get_id_row_key_map[trace_record.get_id] = {} - self.get_id_row_key_map[trace_record.get_id]["h"] = False - if self.get_id_row_key_map[trace_record.get_id]["h"]: - # We treat future accesses as hits since this get request - # completes. - self._update_stats(trace_record.access_time, is_hit=True) - return - if trace_record.key_id not in self.get_id_row_key_map[trace_record.get_id]: - # First time seen this key. - is_hit = self._access_kv( - trace_record, - key="g{}".format(trace_record.key_id), - hash=trace_record.key_id, - value_size=trace_record.kv_size, - no_insert=False, - ) - inserted = False - if trace_record.kv_size > 0: - inserted = True - self.get_id_row_key_map[trace_record.get_id][ - trace_record.key_id - ] = inserted - self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit - if self.get_id_row_key_map[trace_record.get_id]["h"]: - # We treat future accesses as hits since this get request - # completes. - self._update_stats(trace_record.access_time, is_hit=True) - return - # Access its blocks. + self._access_row(trace_record) + return + is_hit = self._access_kv( + trace_record, + self.block_key(trace_record), + trace_record.block_id, + trace_record.block_size, + trace_record.no_insert, + ) + self._update_stats( + trace_record.access_time, is_hit=is_hit, miss_bytes=trace_record.block_size + ) + + def _access_row(self, trace_record): + row_key = self.row_key(trace_record) + self.max_seen_get_id = max(self.max_seen_get_id, trace_record.get_id) + self.get_id_row_key_map.pop( + self.max_seen_get_id - self.retain_get_id_range, None + ) + if trace_record.get_id not in self.get_id_row_key_map: + self.get_id_row_key_map[trace_record.get_id] = {} + self.get_id_row_key_map[trace_record.get_id]["h"] = False + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. + # print("row hit 1") + self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0) + return + if row_key not in self.get_id_row_key_map[trace_record.get_id]: + # First time seen this key. is_hit = self._access_kv( trace_record, - key="b{}".format(trace_record.block_id), - hash=trace_record.block_id, - value_size=trace_record.block_size, - no_insert=trace_record.no_insert, + key=row_key, + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, ) - self._update_stats(trace_record.access_time, is_hit) - if ( - trace_record.kv_size > 0 - and not self.get_id_row_key_map[trace_record.get_id][ - trace_record.key_id - ] - ): - # Insert the row key-value pair. - self._access_kv( - trace_record, - key="g{}".format(trace_record.key_id), - hash=trace_record.key_id, - value_size=trace_record.kv_size, - no_insert=False, - ) - # Mark as inserted. - self.get_id_row_key_map[trace_record.get_id][trace_record.key_id] = True + inserted = False + if trace_record.kv_size > 0: + inserted = True + self.get_id_row_key_map[trace_record.get_id][row_key] = inserted + self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit + if self.get_id_row_key_map[trace_record.get_id]["h"]: + # We treat future accesses as hits since this get request + # completes. 
+ # print("row hit 2") + self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0) return - # Access the block. + # Access its blocks. + no_insert = trace_record.no_insert + if ( + self.enable_cache_row_key == 2 + and trace_record.kv_size > 0 + and trace_record.block_type == 9 + ): + no_insert = True is_hit = self._access_kv( trace_record, - key="b{}".format(trace_record.block_id), + key=self.block_key(trace_record), hash=trace_record.block_id, value_size=trace_record.block_size, - no_insert=trace_record.no_insert, + no_insert=no_insert, + ) + self._update_stats( + trace_record.access_time, is_hit, miss_bytes=trace_record.block_size + ) + if ( + trace_record.kv_size > 0 + and not self.get_id_row_key_map[trace_record.get_id][row_key] + ): + # Insert the row key-value pair. + self._access_kv( + trace_record, + key=row_key, + hash=trace_record.key_id, + value_size=trace_record.kv_size, + no_insert=False, + ) + # Mark as inserted. + self.get_id_row_key_map[trace_record.get_id][row_key] = True + + def _access_kv(self, trace_record, key, hash, value_size, no_insert): + # Sanity checks. + assert self.used_size <= self.cache_size + if self._lookup(trace_record, key, hash): + # A cache hit. + return True + if no_insert or value_size <= 0: + return False + # A cache miss. + if value_size > self.cache_size: + # The block is too large to fit into the cache. + return False + self._evict(trace_record, key, hash, value_size) + if self._should_admit(trace_record, key, hash, value_size): + self._insert(trace_record, key, hash, value_size) + self.used_size += value_size + return False + + +class CostClassEntry: + """ + A cost class maintains aggregated statistics of cached entries in a class. + For example, we may define block type as a class. Then, cached blocks of the + same type will share one cost class entry. 
+ """ + + def __init__(self): + self.hits = 0 + self.num_entries_in_cache = 0 + self.size_in_cache = 0 + self.sum_insertion_times = 0 + self.sum_last_access_time = 0 + + def insert(self, trace_record, key, value_size): + self.size_in_cache += value_size + self.num_entries_in_cache += 1 + self.sum_insertion_times += trace_record.access_time / kMicrosInSecond + self.sum_last_access_time += trace_record.access_time / kMicrosInSecond + + def remove(self, insertion_time, last_access_time, key, value_size, num_hits): + self.hits -= num_hits + self.num_entries_in_cache -= 1 + self.sum_insertion_times -= insertion_time / kMicrosInSecond + self.size_in_cache -= value_size + self.sum_last_access_time -= last_access_time / kMicrosInSecond + + def update_on_hit(self, trace_record, last_access_time): + self.hits += 1 + self.sum_last_access_time -= last_access_time / kMicrosInSecond + self.sum_last_access_time += trace_record.access_time / kMicrosInSecond + + def avg_lifetime_in_cache(self, now): + avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache + return now / kMicrosInSecond - avg_insertion_time + + def avg_last_access_time(self): + if self.num_entries_in_cache == 0: + return 0 + return float(self.sum_last_access_time) / float(self.num_entries_in_cache) + + def avg_size(self): + if self.num_entries_in_cache == 0: + return 0 + return float(self.sum_last_access_time) / float(self.num_entries_in_cache) + + def density(self, now): + avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache + in_cache_duration = now / kMicrosInSecond - avg_insertion_time + return self.size_in_cache * in_cache_duration + + +class MLCache(Cache): + """ + MLCache is the base class for implementations of alternative replacement + policies using reinforcement learning. + """ + + def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): + super(MLCache, self).__init__(cache_size, enable_cache_row_key) + self.table = HashTable() + self.policy_stats = PolicyStats(kSecondsInMinute, policies) + self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies) + self.policies = policies + self.cost_classes = {} + self.cost_class_label = cost_class_label + + def is_ml_cache(self): + return True + + def _lookup(self, trace_record, key, hash): + value = self.table.lookup(key, hash) + if value is not None: + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = value.cost_class(self.cost_class_label) + assert cost_class in self.cost_classes + self.cost_classes[cost_class].update_on_hit( + trace_record, value.last_access_time + ) + # Update the entry's last access time. + self.table.insert( + key, + hash, + CacheEntry( + value_size=value.value_size, + cf_id=value.cf_id, + level=value.level, + block_type=value.block_type, + table_id=value.table_id, + access_number=self.miss_ratio_stats.num_accesses, + time_s=trace_record.access_time, + num_hits=value.num_hits + 1, + ), + ) + return True + return False + + def _evict(self, trace_record, key, hash, value_size): + # Select a policy, random sample kSampleSize keys from the cache, then + # evict keys in the sample set until we have enough room for the new + # entry. 
+ policy_index = self._select_policy(trace_record, key) + assert policy_index < len(self.policies) and policy_index >= 0 + self.policies[policy_index].delete(key) + self.policy_stats.update_metrics(trace_record.access_time, policy_index) + self.per_hour_policy_stats.update_metrics( + trace_record.access_time, policy_index + ) + while self.used_size + value_size > self.cache_size: + # Randomly sample n entries. + samples = self.table.random_sample(kSampleSize) + samples = self.policies[policy_index].prioritize_samples( + samples, + [trace_record.access_time, self.cost_classes, self.cost_class_label], + ) + for hash_entry in samples: + assert self.table.delete(hash_entry.key, hash_entry.hash) is not None + self.used_size -= hash_entry.value.value_size + self.policies[policy_index].evict( + key=hash_entry.key, max_size=self.table.elements + ) + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = hash_entry.value.cost_class(self.cost_class_label) + assert cost_class in self.cost_classes + self.cost_classes[cost_class].remove( + hash_entry.value.insertion_time, + hash_entry.value.last_access_time, + key, + hash_entry.value.value_size, + hash_entry.value.num_hits, + ) + if self.used_size + value_size <= self.cache_size: + break + + def _insert(self, trace_record, key, hash, value_size): + assert self.used_size + value_size <= self.cache_size + entry = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + self.miss_ratio_stats.num_accesses, + trace_record.access_time, ) - self._update_stats(trace_record.access_time, is_hit) + # Update the entry's cost class statistics. + if self.cost_class_label is not None: + cost_class = entry.cost_class(self.cost_class_label) + if cost_class not in self.cost_classes: + self.cost_classes[cost_class] = CostClassEntry() + self.cost_classes[cost_class].insert(trace_record, key, value_size) + self.table.insert(key, hash, entry) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def _select_policy(self, trace_record, key): + raise NotImplementedError class ThompsonSamplingCache(MLCache): """ - An implementation of Thompson Sampling for the Bernoulli Bandit [1]. - [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, + An implementation of Thompson Sampling for the Bernoulli Bandit. + + Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. Trends Mach. Learn. 11, 1 (July 2018), 1-96. 
DOI: https://doi.org/10.1561/2200000070 """ - def __init__(self, cache_size, enable_cache_row_key, policies, init_a=1, init_b=1): + def __init__( + self, + cache_size, + enable_cache_row_key, + policies, + cost_class_label, + init_a=1, + init_b=1, + ): super(ThompsonSamplingCache, self).__init__( - cache_size, enable_cache_row_key, policies + cache_size, enable_cache_row_key, policies, cost_class_label ) self._as = {} self._bs = {} @@ -614,6 +1025,8 @@ def __init__(self, cache_size, enable_cache_row_key, policies, init_a=1, init_b= self._bs = [init_b] * len(self.policies) def _select_policy(self, trace_record, key): + if len(self.policies) == 1: + return 0 samples = [ np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies)) ] @@ -626,23 +1039,28 @@ def _select_policy(self, trace_record, key): def cache_name(self): if self.enable_cache_row_key: - return "Hybrid ThompsonSampling (ts_hybrid)" - return "ThompsonSampling (ts)" + return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format( + self.cost_class_label + ) + return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label) class LinUCBCache(MLCache): """ - An implementation of LinUCB with disjoint linear models [2]. - [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. + An implementation of LinUCB with disjoint linear models. + + Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. A contextual-bandit approach to personalized news article recommendation. In Proceedings of the 19th international conference on World wide web (WWW '10). ACM, New York, NY, USA, 661-670. DOI=http://dx.doi.org/10.1145/1772690.1772758 """ - def __init__(self, cache_size, enable_cache_row_key, policies): - super(LinUCBCache, self).__init__(cache_size, enable_cache_row_key, policies) - self.nfeatures = 4 # Block type, caller, level, cf. + def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): + super(LinUCBCache, self).__init__( + cache_size, enable_cache_row_key, policies, cost_class_label + ) + self.nfeatures = 4 # Block type, level, cf. self.th = np.zeros((len(self.policies), self.nfeatures)) self.eps = 0.2 self.b = np.zeros_like(self.th) @@ -655,11 +1073,12 @@ def __init__(self, cache_size, enable_cache_row_key, policies): self.alph = 0.2 def _select_policy(self, trace_record, key): + if len(self.policies) == 1: + return 0 x_i = np.zeros(self.nfeatures) # The current context vector x_i[0] = trace_record.block_type - x_i[1] = trace_record.caller - x_i[2] = trace_record.level - x_i[3] = trace_record.cf_id + x_i[1] = trace_record.level + x_i[2] = trace_record.cf_id p = np.zeros(len(self.policies)) for a in range(len(self.policies)): self.th_hat[a] = self.A_inv[a].dot(self.b[a]) @@ -679,8 +1098,429 @@ def _select_policy(self, trace_record, key): def cache_name(self): if self.enable_cache_row_key: - return "Hybrid LinUCB (linucb_hybrid)" - return "LinUCB (linucb)" + return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format( + self.cost_class_label + ) + return "LinUCB with cost class {} (linucb)".format(self.cost_class_label) + + +class OPTCacheEntry: + """ + A cache entry for the OPT algorithm. The entries are sorted based on its + next access sequence number in reverse order, i.e., the entry which next + access is the furthest in the future is ordered before other entries. 
+ """ + + def __init__(self, key, next_access_seq_no, value_size): + self.key = key + self.next_access_seq_no = next_access_seq_no + self.value_size = value_size + self.is_removed = False + + def __cmp__(self, other): + if other.next_access_seq_no != self.next_access_seq_no: + return other.next_access_seq_no - self.next_access_seq_no + return self.value_size - other.value_size + + def __repr__(self): + return "({} {} {} {})".format( + self.key, self.next_access_seq_no, self.value_size, self.is_removed + ) + + +class PQTable: + """ + A hash table with a priority queue. + """ + + def __init__(self): + # A list of entries arranged in a heap sorted based on the entry custom + # implementation of __cmp__ + self.pq = [] + self.table = {} + + def pqinsert(self, entry): + "Add a new key or update the priority of an existing key" + # Remove the entry from the table first. + removed_entry = self.table.pop(entry.key, None) + if removed_entry: + # Mark as removed since there is no 'remove' API in heappq. + # Instead, an entry in pq is removed lazily when calling pop. + removed_entry.is_removed = True + self.table[entry.key] = entry + heapq.heappush(self.pq, entry) + return removed_entry + + def pqpop(self): + while self.pq: + entry = heapq.heappop(self.pq) + if not entry.is_removed: + del self.table[entry.key] + return entry + return None + + def pqpeek(self): + while self.pq: + entry = self.pq[0] + if not entry.is_removed: + return entry + heapq.heappop(self.pq) + return + + def __contains__(self, k): + return k in self.table + + def __getitem__(self, k): + return self.table[k] + + def __len__(self): + return len(self.table) + + def values(self): + return self.table.values() + + +class OPTCache(Cache): + """ + An implementation of the Belady MIN algorithm. OPTCache evicts an entry + in the cache whose next access occurs furthest in the future. + + Note that Belady MIN algorithm is optimal assuming all blocks having the + same size and a missing entry will be inserted in the cache. + These are NOT true for the block cache trace since blocks have different + sizes and we may not insert a block into the cache upon a cache miss. + However, it is still useful to serve as a "theoretical upper bound" on the + lowest miss ratio we can achieve given a cache size. + + L. A. Belady. 1966. A Study of Replacement Algorithms for a + Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101. + DOI=http://dx.doi.org/10.1147/sj.52.0078 + """ + + def __init__(self, cache_size): + super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0) + self.table = PQTable() + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update its next access time. + assert ( + self.table.pqinsert( + OPTCacheEntry( + key, trace_record.next_access_seq_no, self.table[key].value_size + ) + ) + is not None + ) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_entry = self.table.pqpop() + assert evict_entry is not None + self.used_size -= evict_entry.value_size + + def _insert(self, trace_record, key, hash, value_size): + assert ( + self.table.pqinsert( + OPTCacheEntry(key, trace_record.next_access_seq_no, value_size) + ) + is None + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def cache_name(self): + return "Belady MIN (opt)" + + +class GDSizeEntry: + """ + A cache entry for the greedy dual size replacement policy. 
+ """ + + def __init__(self, key, value_size, priority): + self.key = key + self.value_size = value_size + self.priority = priority + self.is_removed = False + + def __cmp__(self, other): + if other.priority != self.priority: + return self.priority - other.priority + return self.value_size - other.value_size + + def __repr__(self): + return "({} {} {} {})".format( + self.key, self.next_access_seq_no, self.value_size, self.is_removed + ) + + +class GDSizeCache(Cache): + """ + An implementation of the greedy dual size algorithm. + We define cost as an entry's size. + + See https://www.usenix.org/legacy/publications/library/proceedings/usits97/full_papers/cao/cao_html/node8.html + and N. Young. The k-server dual and loose competitiveness for paging. + Algorithmica,June 1994, vol. 11,(no.6):525-41. + Rewritten version of ''On-line caching as cache size varies'', + in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key) + self.table = PQTable() + self.L = 0.0 + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid GreedyDualSize (gdsize_hybrid)" + return "GreedyDualSize (gdsize)" + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update its priority. + entry = self.table[key] + assert ( + self.table.pqinsert( + GDSizeEntry(key, entry.value_size, self.L + entry.value_size) + ) + is not None + ) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_entry = self.table.pqpop() + assert evict_entry is not None + self.L = evict_entry.priority + self.used_size -= evict_entry.value_size + + def _insert(self, trace_record, key, hash, value_size): + assert ( + self.table.pqinsert(GDSizeEntry(key, value_size, self.L + value_size)) + is None + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + +class Deque(object): + """A Deque class facilitates the implementation of LRU and ARC.""" + + def __init__(self): + self.od = OrderedDict() + + def appendleft(self, k): + if k in self.od: + del self.od[k] + self.od[k] = None + + def pop(self): + item = self.od.popitem(last=False) if self.od else None + if item is not None: + return item[0] + return None + + def remove(self, k): + del self.od[k] + + def __len__(self): + return len(self.od) + + def __contains__(self, k): + return k in self.od + + def __iter__(self): + return reversed(self.od) + + def __repr__(self): + return "Deque(%r)" % (list(self),) + + +class ARCCache(Cache): + """ + An implementation of ARC. ARC assumes that all blocks are having the + same size. The size of index and filter blocks are variable. To accommodate + this, we modified ARC as follows: + 1) We use 16 KB as the average block size and calculate the number of blocks + (c) in the cache. + 2) When we insert an entry, the cache evicts entries in both t1 and t2 + queues until it has enough space for the new entry. This also requires + modification of the algorithm to maintain a maximum of 2*c blocks. + + Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low + Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on + File and Storage Technologies (FAST '03). USENIX Association, Berkeley, CA, + USA, 115-130. 
+ """ + + def __init__(self, cache_size, enable_cache_row_key): + super(ARCCache, self).__init__(cache_size, enable_cache_row_key) + self.table = {} + self.c = cache_size / 16 * 1024 # Number of elements in the cache. + self.p = 0 # Target size for the list T1 + # L1: only once recently + self.t1 = Deque() # T1: recent cache entries + self.b1 = Deque() # B1: ghost entries recently evicted from the T1 cache + # L2: at least twice recently + self.t2 = Deque() # T2: frequent entries + self.b2 = Deque() # B2: ghost entries recently evicted from the T2 cache + + def _replace(self, key, value_size): + while self.used_size + value_size > self.cache_size: + if self.t1 and ((key in self.b2) or (len(self.t1) > self.p)): + old = self.t1.pop() + self.b1.appendleft(old) + else: + if self.t2: + old = self.t2.pop() + self.b2.appendleft(old) + else: + old = self.t1.pop() + self.b1.appendleft(old) + self.used_size -= self.table[old].value_size + del self.table[old] + + def _lookup(self, trace_record, key, hash): + # Case I: key is in T1 or T2. + # Move key to MRU position in T2. + if key in self.t1: + self.t1.remove(key) + self.t2.appendleft(key) + return True + + if key in self.t2: + self.t2.remove(key) + self.t2.appendleft(key) + return True + return False + + def _evict(self, trace_record, key, hash, value_size): + # Case II: key is in B1 + # Move x from B1 to the MRU position in T2 (also fetch x to the cache). + if key in self.b1: + self.p = min(self.c, self.p + max(len(self.b2) / len(self.b1), 1)) + self._replace(key, value_size) + self.b1.remove(key) + self.t2.appendleft(key) + return + + # Case III: key is in B2 + # Move x from B2 to the MRU position in T2 (also fetch x to the cache). + if key in self.b2: + self.p = max(0, self.p - max(len(self.b1) / len(self.b2), 1)) + self._replace(key, value_size) + self.b2.remove(key) + self.t2.appendleft(key) + return + + # Case IV: key is not in (T1 u B1 u T2 u B2) + self._replace(key, value_size) + while len(self.t1) + len(self.b1) >= self.c and self.b1: + self.b1.pop() + + total = len(self.t1) + len(self.b1) + len(self.t2) + len(self.b2) + while total >= (2 * self.c) and self.b2: + self.b2.pop() + total -= 1 + # Finally, move it to MRU position in T1. + self.t1.appendleft(key) + return + + def _insert(self, trace_record, key, hash, value_size): + self.table[key] = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + 0, + trace_record.access_time, + ) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid Adaptive Replacement Cache (arc_hybrid)" + return "Adaptive Replacement Cache (arc)" + + +class LRUCache(Cache): + """ + A strict LRU queue. + """ + + def __init__(self, cache_size, enable_cache_row_key): + super(LRUCache, self).__init__(cache_size, enable_cache_row_key) + self.table = {} + self.lru = Deque() + + def cache_name(self): + if self.enable_cache_row_key: + return "Hybrid LRU (lru_hybrid)" + return "LRU (lru)" + + def _lookup(self, trace_record, key, hash): + if key not in self.table: + return False + # A cache hit. Update LRU queue. 
+ self.lru.remove(key) + self.lru.appendleft(key) + return True + + def _evict(self, trace_record, key, hash, value_size): + while self.used_size + value_size > self.cache_size: + evict_key = self.lru.pop() + self.used_size -= self.table[evict_key].value_size + del self.table[evict_key] + + def _insert(self, trace_record, key, hash, value_size): + self.table[key] = CacheEntry( + value_size, + trace_record.cf_id, + trace_record.level, + trace_record.block_type, + trace_record.table_id, + 0, + trace_record.access_time, + ) + self.lru.appendleft(key) + + def _should_admit(self, trace_record, key, hash, value_size): + return True + + +class TraceCache(Cache): + """ + A trace cache. Lookup returns true if the trace observes a cache hit. + It is used to maintain cache hits observed in the trace. + """ + + def __init__(self, cache_size): + super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0) + + def _lookup(self, trace_record, key, hash): + return trace_record.is_hit + + def _evict(self, trace_record, key, hash, value_size): + pass + + def _insert(self, trace_record, key, hash, value_size): + pass + + def _should_admit(self, trace_record, key, hash, value_size): + return False + + def cache_name(self): + return "Trace" def parse_cache_size(cs): @@ -695,47 +1535,255 @@ def parse_cache_size(cs): def create_cache(cache_type, cache_size, downsample_size): - policies = [] - policies.append(LRUPolicy()) - policies.append(MRUPolicy()) - policies.append(LFUPolicy()) cache_size = cache_size / downsample_size - enable_cache_row_key = False + enable_cache_row_key = 0 + if "hybridn" in cache_type: + enable_cache_row_key = 2 + cache_type = cache_type[:-8] if "hybrid" in cache_type: - enable_cache_row_key = True + enable_cache_row_key = 1 cache_type = cache_type[:-7] if cache_type == "ts": - return ThompsonSamplingCache(cache_size, enable_cache_row_key, policies) + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()], + cost_class_label=None, + ) elif cache_type == "linucb": - return LinUCBCache(cache_size, enable_cache_row_key, policies) + return LinUCBCache( + cache_size, + enable_cache_row_key, + [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()], + cost_class_label=None, + ) + elif cache_type == "pylru": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [LRUPolicy()], cost_class_label=None + ) + elif cache_type == "pymru": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [MRUPolicy()], cost_class_label=None + ) + elif cache_type == "pylfu": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [LFUPolicy()], cost_class_label=None + ) + elif cache_type == "pyhb": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [HyperbolicPolicy()], + cost_class_label=None, + ) + elif cache_type == "pycctbbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table_bt", + ) + elif cache_type == "pycccf": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="cf" + ) + elif cache_type == "pycctblevelbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table_level_bt", + ) + elif cache_type == "pycccfbt": + return ThompsonSamplingCache( + cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="cf_bt", + ) + elif cache_type == "pycctb": + return ThompsonSamplingCache( + 
cache_size, + enable_cache_row_key, + [CostClassPolicy()], + cost_class_label="table", + ) + elif cache_type == "pyccbt": + return ThompsonSamplingCache( + cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="bt" + ) + elif cache_type == "opt": + if enable_cache_row_key: + print("opt does not support hybrid mode.") + assert False + return OPTCache(cache_size) + elif cache_type == "trace": + if enable_cache_row_key: + print("trace does not support hybrid mode.") + assert False + return TraceCache(cache_size) + elif cache_type == "lru": + return LRUCache(cache_size, enable_cache_row_key) + elif cache_type == "arc": + return ARCCache(cache_size, enable_cache_row_key) + elif cache_type == "gdsize": + return GDSizeCache(cache_size, enable_cache_row_key) else: print("Unknown cache type {}".format(cache_type)) assert False return None -def run(trace_file_path, cache_type, cache, warmup_seconds): +class BlockAccessTimeline: + """ + BlockAccessTimeline stores all accesses of a block. + """ + + def __init__(self): + self.accesses = [] + self.current_access_index = 1 + + def get_next_access(self): + if self.current_access_index == len(self.accesses): + return sys.maxsize + next_access_seq_no = self.accesses[self.current_access_index] + self.current_access_index += 1 + return next_access_seq_no + + +def percent(e1, e2): + if e2 == 0: + return -1 + return float(e1) * 100.0 / float(e2) + + +def is_target_cf(access_cf, target_cf_name): + if target_cf_name == "all": + return True + return access_cf == target_cf_name + + +def run( + trace_file_path, + cache_type, + cache, + warmup_seconds, + max_accesses_to_process, + target_cf_name, +): warmup_complete = False - num = 0 + trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + access_seq_no = 0 + time_interval = 1 + start_time = time.time() trace_start_time = 0 trace_duration = 0 - start_time = time.time() + is_opt_cache = False + if cache.cache_name() == "Belady MIN (opt)": + is_opt_cache = True + + block_access_timelines = {} + num_no_inserts = 0 + num_blocks_with_no_size = 0 + num_inserts_block_with_no_size = 0 + + if is_opt_cache: + # Read all blocks in memory and stores their access times so that OPT + # can use this information to evict the cached key which next access is + # the furthest in the future. + print("Preprocessing block traces.") + with open(trace_file_path, "r") as trace_file: + for line in trace_file: + if ( + max_accesses_to_process != -1 + and access_seq_no > max_accesses_to_process + ): + break + ts = line.split(",") + timestamp = int(ts[0]) + cf_name = ts[5] + if not is_target_cf(cf_name, target_cf_name): + continue + if trace_start_time == 0: + trace_start_time = timestamp + trace_duration = timestamp - trace_start_time + block_id = int(ts[1]) + block_size = int(ts[3]) + no_insert = int(ts[9]) + if block_id not in block_access_timelines: + block_access_timelines[block_id] = BlockAccessTimeline() + if block_size == 0: + num_blocks_with_no_size += 1 + block_access_timelines[block_id].accesses.append(access_seq_no) + access_seq_no += 1 + if no_insert == 1: + num_no_inserts += 1 + if no_insert == 0 and block_size == 0: + num_inserts_block_with_no_size += 1 + if access_seq_no % 100 != 0: + continue + now = time.time() + if now - start_time > time_interval * 10: + print( + "Take {} seconds to process {} trace records with trace " + "duration of {} seconds. 
Throughput: {} records/second.".format( + now - start_time, + access_seq_no, + trace_duration / 1000000, + access_seq_no / (now - start_time), + ) + ) + time_interval += 1 + print( + "Trace contains {0} blocks, {1}({2:.2f}%) blocks with no size." + "{3} accesses, {4}({5:.2f}%) accesses with no_insert," + "{6}({7:.2f}%) accesses that want to insert but block size is 0.".format( + len(block_access_timelines), + num_blocks_with_no_size, + percent(num_blocks_with_no_size, len(block_access_timelines)), + access_seq_no, + num_no_inserts, + percent(num_no_inserts, access_seq_no), + num_inserts_block_with_no_size, + percent(num_inserts_block_with_no_size, access_seq_no), + ) + ) + + access_seq_no = 0 time_interval = 1 - trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute) + start_time = time.time() + trace_start_time = 0 + trace_duration = 0 + print("Running simulated {} cache on block traces.".format(cache.cache_name())) with open(trace_file_path, "r") as trace_file: for line in trace_file: - num += 1 - if num % 1000000 == 0: + if ( + max_accesses_to_process != -1 + and access_seq_no > max_accesses_to_process + ): + break + if access_seq_no % 1000000 == 0: # Force a python gc periodically to reduce memory usage. gc.collect() ts = line.split(",") timestamp = int(ts[0]) + cf_name = ts[5] + if not is_target_cf(cf_name, target_cf_name): + continue if trace_start_time == 0: trace_start_time = timestamp trace_duration = timestamp - trace_start_time - if not warmup_complete and trace_duration > warmup_seconds * 1000000: + if ( + not warmup_complete + and warmup_seconds > 0 + and trace_duration > warmup_seconds * 1000000 + ): cache.miss_ratio_stats.reset_counter() warmup_complete = True + next_access_seq_no = 0 + block_id = int(ts[1]) + if is_opt_cache: + next_access_seq_no = block_access_timelines[block_id].get_next_access() record = TraceRecord( access_time=int(ts[0]), block_id=int(ts[1]), @@ -751,13 +1799,23 @@ def run(trace_file_path, cache_type, cache, warmup_seconds): key_id=int(ts[11]), kv_size=int(ts[12]), is_hit=int(ts[13]), + referenced_key_exist_in_block=int(ts[14]), + num_keys_in_block=int(ts[15]), + table_id=int(ts[16]), + seq_number=int(ts[17]), + block_key_size=int(ts[18]), + key_size=int(ts[19]), + block_offset_in_file=int(ts[20]), + next_access_seq_no=next_access_seq_no, ) trace_miss_ratio_stats.update_metrics( - record.access_time, is_hit=record.is_hit + record.access_time, is_hit=record.is_hit, miss_bytes=record.block_size ) cache.access(record) + access_seq_no += 1 del record - if num % 100 != 0: + del ts + if access_seq_no % 100 != 0: continue # Report progress every 10 seconds. now = time.time() @@ -767,9 +1825,9 @@ def run(trace_file_path, cache_type, cache, warmup_seconds): "duration of {} seconds. Throughput: {} records/second. " "Trace miss ratio {}".format( now - start_time, - num, + access_seq_no, trace_duration / 1000000, - num / (now - start_time), + access_seq_no / (now - start_time), trace_miss_ratio_stats.miss_ratio(), ) ) @@ -787,19 +1845,33 @@ def run(trace_file_path, cache_type, cache, warmup_seconds): "Take {} seconds to process {} trace records with trace duration of {} " "seconds. Throughput: {} records/second. 
Trace miss ratio {}".format( now - start_time, - num, + access_seq_no, trace_duration / 1000000, - num / (now - start_time), + access_seq_no / (now - start_time), trace_miss_ratio_stats.miss_ratio(), ) ) + print( + "{},0,0,{},{},{}".format( + cache_type, + cache.cache_size, + cache.miss_ratio_stats.miss_ratio(), + cache.miss_ratio_stats.num_accesses, + ) + ) return trace_start_time, trace_duration def report_stats( - cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time + cache, + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, ): - cache_label = "{}-{}".format(cache_type, cache_size) + cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name) with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file: mrc_file.write( "{},0,0,{},{},{}\n".format( @@ -809,56 +1881,120 @@ def report_stats( cache.miss_ratio_stats.num_accesses, ) ) - cache.policy_stats.write_policy_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.policy_stats.write_policy_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.miss_ratio_stats.write_miss_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.miss_ratio_stats.write_miss_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_policy_stats.write_policy_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_policy_stats.write_policy_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_miss_ratio_stats.write_miss_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) - cache.per_hour_miss_ratio_stats.write_miss_ratio_timeline( - cache_type, cache_size, result_dir, trace_start_time, trace_end_time - ) + + cache_stats = [ + cache.per_second_miss_ratio_stats, + cache.miss_ratio_stats, + cache.per_hour_miss_ratio_stats, + ] + for i in range(len(cache_stats)): + avg_miss_bytes, p95_miss_bytes = cache_stats[i].compute_miss_bytes() + + with open( + "{}/data-ml-avgmb-{}-{}".format( + result_dir, cache_stats[i].time_unit, cache_label + ), + "w+", + ) as mb_file: + mb_file.write( + "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes) + ) + + with open( + "{}/data-ml-p95mb-{}-{}".format( + result_dir, cache_stats[i].time_unit, cache_label + ), + "w+", + ) as mb_file: + mb_file.write( + "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes) + ) + + cache_stats[i].write_miss_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + cache_stats[i].write_miss_ratio_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + + if not cache.is_ml_cache(): + return + + policy_stats = [cache.policy_stats, cache.per_hour_policy_stats] + for i in range(len(policy_stats)): + policy_stats[i].write_policy_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) + policy_stats[i].write_policy_ratio_timeline( + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, + ) if __name__ == "__main__": - if len(sys.argv) <= 6: + if len(sys.argv) <= 8: print( - "Must provide 6 arguments. " - "1) cache_type (ts, ts_hybrid, linucb, linucb_hybrid). 
" - "2) cache size (xM, xG, xT). " + "Must provide 8 arguments.\n" + "1) Cache type (ts, linucb, arc, lru, opt, pylru, pymru, pylfu, " + "pyhb, gdsize, trace). One may evaluate the hybrid row_block cache " + "by appending '_hybrid' to a cache_type, e.g., ts_hybrid. " + "Note that hybrid is not supported with opt and trace. \n" + "2) Cache size (xM, xG, xT).\n" "3) The sampling frequency used to collect the trace. (The " - "simulation scales down the cache size by the sampling frequency). " - "4) Warmup seconds (The number of seconds used for warmup). " - "5) Trace file path. " - "6) Result directory (A directory that saves generated results)" + "simulation scales down the cache size by the sampling frequency).\n" + "4) Warmup seconds (The number of seconds used for warmup).\n" + "5) Trace file path.\n" + "6) Result directory (A directory that saves generated results)\n" + "7) Max number of accesses to process\n" + "8) The target column family. (The simulation will only run " + "accesses on the target column family. If it is set to all, " + "it will run against all accesses.)" ) exit(1) + print("Arguments: {}".format(sys.argv)) cache_type = sys.argv[1] cache_size = parse_cache_size(sys.argv[2]) downsample_size = int(sys.argv[3]) warmup_seconds = int(sys.argv[4]) trace_file_path = sys.argv[5] result_dir = sys.argv[6] + max_accesses_to_process = int(sys.argv[7]) + target_cf_name = sys.argv[8] cache = create_cache(cache_type, cache_size, downsample_size) trace_start_time, trace_duration = run( - trace_file_path, cache_type, cache, warmup_seconds + trace_file_path, + cache_type, + cache, + warmup_seconds, + max_accesses_to_process, + target_cf_name, ) trace_end_time = trace_start_time + trace_duration report_stats( - cache, cache_type, cache_size, result_dir, trace_start_time, trace_end_time + cache, + cache_type, + cache_size, + target_cf_name, + result_dir, + trace_start_time, + trace_end_time, ) diff --git a/tools/block_cache_analyzer/block_cache_pysim.sh b/tools/block_cache_analyzer/block_cache_pysim.sh index 58193a0635a..295f734aa05 100644 --- a/tools/block_cache_analyzer/block_cache_pysim.sh +++ b/tools/block_cache_analyzer/block_cache_pysim.sh @@ -10,6 +10,10 @@ # warmup_seconds: The number of seconds used for warmup. # max_jobs: The max number of concurrent pysims to run. +# Install required packages to run simulations. +# sudo dnf install -y numpy scipy python-matplotlib ipython python-pandas sympy python-nose atlas-devel +ulimit -c 0 + if [ $# -ne 5 ]; then echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs" exit 0 @@ -20,17 +24,26 @@ result_dir="$2" downsample_size="$3" warmup_seconds="$4" max_jobs="$5" -current_jobs=0 +max_num_accesses=100000000 +current_jobs=1 ml_tmp_result_dir="$result_dir/ml" rm -rf "$ml_tmp_result_dir" mkdir -p "$result_dir" mkdir -p "$ml_tmp_result_dir" -for cache_type in "ts" "linucb" "ts_hybrid" "linucb_hybrid" +# Report miss ratio in the trace. +current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) +for cf_name in "all" +do +for cache_size in "1G" "2G" "4G" "8G" "16G" #"12G" "16G" "1T" do -for cache_size in "16M" "256M" "1G" "2G" "4G" "8G" "12G" "16G" +for cache_type in "opt" "lru" "pylru" "pycctbbt" "pyhb" "ts" "trace" "lru_hybrid" #"pycctblevelbt" #"lru_hybridn" "opt" #"pylru" "pylru_hybrid" "pycctbbt" "pycccfbt" "trace" do + if [[ $cache_type == "trace" && $cache_size != "16G" ]]; then + # We only need to collect miss ratios observed in the trace once. 
+ continue + fi while [ "$current_jobs" -ge "$max_jobs" ] do sleep 10 @@ -38,12 +51,13 @@ do current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep) echo "Waiting jobs to complete. Number of running jobs: $current_jobs" done - output="log-ml-$cache_type-$cache_size" - echo "Running simulation for $cache_type and cache size $cache_size. Number of running jobs: $current_jobs. " - nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" >& $ml_tmp_result_dir/$output & + output="log-ml-$cache_type-$cache_size-$cf_name" + echo "Running simulation for $cache_type, cache size $cache_size, and cf_name $cf_name. Number of running jobs: $current_jobs. " + nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" "$max_num_accesses" "$cf_name" >& "$ml_tmp_result_dir/$output" & current_jobs=$((current_jobs+1)) done done +done # Wait for all jobs to complete. while [ $current_jobs -gt 0 ] @@ -57,14 +71,14 @@ done echo "Combine individual pysim output files" rm -rf "$result_dir/ml_*" -mrc_file="$result_dir/ml_mrc" for header in "header-" "data-" do -for fn in $ml_tmp_result_dir/* +for fn in "$ml_tmp_result_dir"/* do sum_file="" time_unit="" capacity="" + target_cf_name="" if [[ $fn == *"timeline"* ]]; then tmpfn="$fn" IFS='-' read -ra elements <<< "$tmpfn" @@ -79,24 +93,43 @@ do done time_unit_index=$((time_unit_index+1)) capacity_index=$((time_unit_index+2)) + target_cf_name_index=$((time_unit_index+3)) time_unit="${elements[$time_unit_index]}_" capacity="${elements[$capacity_index]}_" + target_cf_name="${elements[$target_cf_name_index]}_" fi - if [[ $fn == "${header}ml-policy-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}policy_timeline" + if [[ $fn == *"${header}ml-policy-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_timeline" + fi + if [[ $fn == *"${header}ml-policy-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_ratio_timeline" fi - if [[ $fn == "${header}ml-policy-ratio-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}policy_ratio_timeline" + if [[ $fn == *"${header}ml-miss-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_timeline" fi - if [[ $fn == "${header}ml-miss-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}miss_timeline" + if [[ $fn == *"${header}ml-miss-ratio-timeline"* ]]; then + sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_ratio_timeline" + fi + if [[ $fn == *"${header}ml-mrc"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${target_cf_name}_mrc" fi - if [[ $fn == "${header}ml-miss-ratio-timeline"* ]]; then - sum_file="$result_dir/ml_${capacity}${time_unit}miss_ratio_timeline" + if [[ $fn == *"${header}ml-avgmb"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit=${elements[3]} + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_avgmb" fi - if [[ $fn == "${header}ml-mrc"* ]]; then - sum_file="$mrc_file" + if [[ $fn == *"${header}ml-p95mb"* ]]; then + tmpfn="$fn" + IFS='-' read -ra elements <<< "$tmpfn" + time_unit=${elements[3]} + target_cf_name=${elements[-1]} + sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_p95mb" fi if [[ $sum_file == "" 
]]; then continue @@ -106,13 +139,18 @@ do continue fi fi - cat "$ml_tmp_result_dir/$fn" >> "$sum_file" + cat "$fn" >> "$sum_file" done done echo "Done" -# Sort MRC file by cache_type and cache_size. -tmp_file="$result_dir/tmp_mrc" -cat "$mrc_file" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" -cat "$tmp_file" > "$mrc_file" -rm -rf "$tmp_file" +for fn in $result_dir/* +do + if [[ $fn == *"_mrc" || $fn == *"_avgmb" || $fn == *"_p95mb" ]]; then + # Sort MRC file by cache_type and cache_size. + tmp_file="$result_dir/tmp_mrc" + cat "$fn" | sort -t ',' -k1,1 -k4,4n > "$tmp_file" + cat "$tmp_file" > "$fn" + rm -rf "$tmp_file" + fi +done diff --git a/tools/block_cache_analyzer/block_cache_pysim_test.py b/tools/block_cache_analyzer/block_cache_pysim_test.py index e298d7bbd6f..4b2bdeba656 100644 --- a/tools/block_cache_analyzer/block_cache_pysim_test.py +++ b/tools/block_cache_analyzer/block_cache_pysim_test.py @@ -1,17 +1,30 @@ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os import random +import sys from block_cache_pysim import ( + ARCCache, + CacheEntry, + GDSizeCache, HashTable, + HyperbolicPolicy, LFUPolicy, LinUCBCache, + LRUCache, LRUPolicy, MRUPolicy, + OPTCache, + OPTCacheEntry, ThompsonSamplingCache, + TraceCache, TraceRecord, + create_cache, + kMicrosInSecond, kSampleSize, + run, ) @@ -33,30 +46,44 @@ def test_hash_table(): records = 100 for i in range(n): key_id = random.randint(0, records) + v = random.randint(0, records) key = "k{}".format(key_id) - value = "v{}".format(key_id) - action = random.randint(0, 2) - # print "{}:{}:{}".format(action, key, value) + value = CacheEntry(v, v, v, v, v, v, v) + action = random.randint(0, 10) assert len(truth_map) == table.elements, "{} {} {}".format( len(truth_map), table.elements, i ) - if action == 0: - table.insert(key, key_id, value) - truth_map[key] = value - elif action == 1: + if action <= 8: if key in truth_map: assert table.lookup(key, key_id) is not None - assert truth_map[key] == table.lookup(key, key_id) + assert truth_map[key].value_size == table.lookup(key, key_id).value_size else: assert table.lookup(key, key_id) is None + table.insert(key, key_id, value) + truth_map[key] = value else: - table.delete(key, key_id) + deleted = table.delete(key, key_id) + if deleted: + assert key in truth_map if key in truth_map: del truth_map[key] + + # Check all keys are unique in the sample set. 
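+    # random_sample must never return the same key twice; comparing each
+    # sample against its set of unique keys below enforces that.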
+    for _i in range(10):
+        samples = table.random_sample(kSampleSize)
+        unique_keys = {}
+        for sample in samples:
+            unique_keys[sample.key] = True
+        assert len(samples) == len(unique_keys)
+
+    assert len(table) == len(truth_map)
+    for key in truth_map:
+        assert table.lookup(key, int(key[1:])) is not None
+        assert truth_map[key].value_size == table.lookup(key, int(key[1:])).value_size
     print("Test hash table: Success")
 
 
-def assert_metrics(cache, expected_value):
+def assert_metrics(cache, expected_value, expected_value_size=1, custom_hashtable=True):
     assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format(
         expected_value[0], cache.used_size
     )
@@ -70,24 +97,35 @@ def assert_metrics(cache, expected_value):
     ), "Expected {}, Actual {}".format(
         expected_value[2], cache.miss_ratio_stats.num_misses
     )
-    assert cache.table.elements == len(expected_value[3]) + len(
+    assert len(cache.table) == len(expected_value[3]) + len(
         expected_value[4]
     ), "Expected {}, Actual {}".format(
         len(expected_value[3]) + len(expected_value[4]), cache.table.elements
     )
     for expected_k in expected_value[3]:
-        val = cache.table.lookup("b{}".format(expected_k), expected_k)
-        assert val is not None
-        assert val.value_size == 1
+        if custom_hashtable:
+            val = cache.table.lookup("b{}".format(expected_k), expected_k)
+        else:
+            val = cache.table["b{}".format(expected_k)]
+        assert val is not None, "Expected {} Actual: Not Exist {}, Table: {}".format(
+            expected_k, expected_value, cache.table
+        )
+        assert val.value_size == expected_value_size
     for expected_k in expected_value[4]:
-        val = cache.table.lookup("g{}".format(expected_k), expected_k)
+        if custom_hashtable:
+            val = cache.table.lookup("g0-{}".format(expected_k), expected_k)
+        else:
+            val = cache.table["g0-{}".format(expected_k)]
         assert val is not None
-        assert val.value_size == 1
+        assert val.value_size == expected_value_size
 
 
 # Access k1, k1, k2, k3, k3, k3, k4
-def test_cache(policies, expected_value):
-    cache = ThompsonSamplingCache(3, False, policies)
+# When k4 is inserted,
+# LRU should evict k1.
+# LFU should evict k2.
+# MRU should evict k3.
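+# (After the access sequence below, k1 is the least recently used entry,
+# k2 the least frequently used, and k3 the most recently used.)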
+def test_cache(cache, expected_value, custom_hashtable=True): k1 = TraceRecord( access_time=0, block_id=1, @@ -103,6 +141,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) k2 = TraceRecord( access_time=1, @@ -119,6 +165,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) k3 = TraceRecord( access_time=2, @@ -135,6 +189,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) k4 = TraceRecord( access_time=3, @@ -151,6 +213,14 @@ def test_cache(policies, expected_value): key_id=1, kv_size=5, is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=0, ) sequence = [k1, k1, k2, k3, k3, k3] index = 0 @@ -167,20 +237,29 @@ def test_cache(policies, expected_value): expected_values.append([3, 5, 3, [1, 2, 3], []]) # Access k3, hit. expected_values.append([3, 6, 3, [1, 2, 3], []]) + access_time = 0 for access in sequence: + access.access_time = access_time cache.access(access) - assert_metrics(cache, expected_values[index]) + assert_metrics( + cache, + expected_values[index], + expected_value_size=1, + custom_hashtable=custom_hashtable, + ) + access_time += 1 index += 1 + k4.access_time = access_time cache.access(k4) - assert_metrics(cache, expected_value) + assert_metrics( + cache, expected_value, expected_value_size=1, custom_hashtable=custom_hashtable + ) -def test_lru_cache(): +def test_lru_cache(cache, custom_hashtable): print("Test LRU cache") - policies = [] - policies.append(LRUPolicy()) # Access k4, miss. evict k1 - test_cache(policies, [3, 7, 4, [2, 3, 4], []]) + test_cache(cache, [3, 7, 4, [2, 3, 4], []], custom_hashtable) print("Test LRU cache: Success") @@ -189,7 +268,10 @@ def test_mru_cache(): policies = [] policies.append(MRUPolicy()) # Access k4, miss. evict k3 - test_cache(policies, [3, 7, 4, [1, 2, 4], []]) + test_cache( + ThompsonSamplingCache(3, False, policies, cost_class_label=None), + [3, 7, 4, [1, 2, 4], []], + ) print("Test MRU cache: Success") @@ -198,22 +280,36 @@ def test_lfu_cache(): policies = [] policies.append(LFUPolicy()) # Access k4, miss. 
evict k2
-    test_cache(policies, [3, 7, 4, [1, 3, 4], []])
+    test_cache(
+        ThompsonSamplingCache(3, False, policies, cost_class_label=None),
+        [3, 7, 4, [1, 3, 4], []],
+    )
     print("Test LFU cache: Success")
 
 
 def test_mix(cache):
     print("Test Mix {} cache".format(cache.cache_name()))
     n = 100000
-    records = 199
+    records = 100
+    block_size_table = {}
+    trace_num_misses = 0
     for i in range(n):
         key_id = random.randint(0, records)
         vs = random.randint(0, 10)
+        now = i * kMicrosInSecond
+        block_size = vs
+        if key_id in block_size_table:
+            block_size = block_size_table[key_id]
+        else:
+            block_size_table[key_id] = block_size
+        is_hit = key_id % 2
+        if is_hit == 0:
+            trace_num_misses += 1
         k = TraceRecord(
-            access_time=i,
+            access_time=now,
             block_id=key_id,
             block_type=1,
-            block_size=vs,
+            block_size=block_size,
             cf_id=0,
             cf_name="",
             level=0,
@@ -223,13 +319,117 @@ def test_mix(cache):
             get_id=key_id,
             key_id=key_id,
             kv_size=5,
-            is_hit=1,
+            is_hit=is_hit,
+            referenced_key_exist_in_block=1,
+            num_keys_in_block=0,
+            table_id=0,
+            seq_number=0,
+            block_key_size=0,
+            key_size=0,
+            block_offset_in_file=0,
+            next_access_seq_no=vs,
         )
         cache.access(k)
     assert cache.miss_ratio_stats.miss_ratio() > 0
+    if cache.cache_name() == "Trace":
+        assert cache.miss_ratio_stats.num_accesses == n
+        assert cache.miss_ratio_stats.num_misses == trace_num_misses
+    else:
+        assert cache.used_size <= cache.cache_size
+        all_values = cache.table.values()
+        cached_size = 0
+        for value in all_values:
+            cached_size += value.value_size
+        assert cached_size == cache.used_size, "Expected {} Actual {}".format(
+            cache.used_size, cached_size
+        )
     print("Test Mix {} cache: Success".format(cache.cache_name()))
 
 
+def test_end_to_end():
+    print("Test All caches")
+    n = 100000
+    nblocks = 1000
+    block_size = 16 * 1024
+    ncfs = 7
+    nlevels = 6
+    nfds = 100000
+    trace_file_path = "test_trace"
+    # All blocks are of the same size so that OPT must achieve the lowest miss
+    # ratio.
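+    # (With uniform block sizes, Belady's MIN algorithm is provably optimal,
+    # so every other policy's miss ratio is compared against OPT's below.)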
+    with open(trace_file_path, "w+") as trace_file:
+        access_records = ""
+        for i in range(n):
+            key_id = random.randint(0, nblocks)
+            cf_id = random.randint(0, ncfs)
+            level = random.randint(0, nlevels)
+            fd = random.randint(0, nfds)
+            now = i * kMicrosInSecond
+            access_record = ""
+            access_record += "{},".format(now)
+            access_record += "{},".format(key_id)
+            access_record += "{},".format(9)  # block type
+            access_record += "{},".format(block_size)  # block size
+            access_record += "{},".format(cf_id)
+            access_record += "cf_{},".format(cf_id)
+            access_record += "{},".format(level)
+            access_record += "{},".format(fd)
+            access_record += "{},".format(key_id % 3)  # caller
+            access_record += "{},".format(0)  # no insert
+            access_record += "{},".format(i)  # get_id
+            access_record += "{},".format(i)  # key_id
+            access_record += "{},".format(100)  # kv_size
+            access_record += "{},".format(1)  # is_hit
+            access_record += "{},".format(1)  # referenced_key_exist_in_block
+            access_record += "{},".format(10)  # num_keys_in_block
+            access_record += "{},".format(1)  # table_id
+            access_record += "{},".format(0)  # seq_number
+            access_record += "{},".format(10)  # block key size
+            access_record += "{},".format(20)  # key size
+            access_record += "{},".format(0)  # block offset
+            access_record = access_record[:-1]
+            access_records += access_record + "\n"
+        trace_file.write(access_records)
+
+    print("Test All caches: Start testing caches")
+    cache_size = block_size * nblocks / 10
+    downsample_size = 1
+    cache_ms = {}
+    for cache_type in [
+        "ts",
+        "opt",
+        "lru",
+        "pylru",
+        "linucb",
+        "gdsize",
+        "pyccbt",
+        "pycctbbt",
+    ]:
+        cache = create_cache(cache_type, cache_size, downsample_size)
+        run(trace_file_path, cache_type, cache, 0, -1, "all")
+        cache_ms[cache_type] = cache
+        assert cache.miss_ratio_stats.num_accesses == n
+
+    for cache_type in cache_ms:
+        cache = cache_ms[cache_type]
+        ms = cache.miss_ratio_stats.miss_ratio()
+        assert ms <= 100.0 and ms >= 0.0
+        # OPT should perform the best.
+        assert cache_ms["opt"].miss_ratio_stats.miss_ratio() <= ms
+        assert cache.used_size <= cache.cache_size
+        all_values = cache.table.values()
+        cached_size = 0
+        for value in all_values:
+            cached_size += value.value_size
+        assert cached_size == cache.used_size, "Expected {} Actual {}".format(
+            cache.used_size, cached_size
+        )
+        print("Test All {}: Success".format(cache.cache_name()))
+
+    os.remove(trace_file_path)
+    print("Test All: Success")
+
+
 def test_hybrid(cache):
     print("Test {} cache".format(cache.cache_name()))
     k = TraceRecord(
@@ -247,6 +447,14 @@ def test_hybrid(cache):
         key_id=1,
         kv_size=0,  # no size.
         is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
     )
     cache.access(k)  # Expect a miss.
     # used size, num accesses, num misses, hash table size, blocks, get keys.
@@ -319,22 +527,208 @@ def test_hybrid(cache):
     k.key_id = 4  # Same row key and should not be inserted again.
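    # (Row keys combine the sst fd number and the user key, neither of which
    # changes here.)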
k.kv_size = 1 cache.access(k) - assert_metrics(cache, [16, 103, 99, [i for i in range(101 - kSampleSize, 101)], []]) + assert_metrics( + cache, [kSampleSize, 103, 99, [i for i in range(101 - kSampleSize, 101)], []] + ) print("Test {} cache: Success".format(cache.cache_name())) +def test_opt_cache(): + print("Test OPT cache") + cache = OPTCache(3) + # seq: 0, 1, 2, 3, 4, 5, 6, 7, 8 + # key: k1, k2, k3, k4, k5, k6, k7, k1, k8 + # next_access: 7, 19, 18, M, M, 17, 16, 25, M + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, # the first get request. + key_id=1, + kv_size=0, # no size. + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=7, + ) + cache.access(k) + assert_metrics( + cache, [1, 1, 1, [1], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 2 + k.next_access_seq_no = 19 + cache.access(k) + assert_metrics( + cache, [2, 2, 2, [1, 2], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 3 + k.next_access_seq_no = 18 + cache.access(k) + assert_metrics( + cache, [3, 3, 3, [1, 2, 3], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 4 + k.next_access_seq_no = sys.maxsize # Never accessed again. + cache.access(k) + # Evict 2 since its next access 19 is the furthest in the future. + assert_metrics( + cache, [3, 4, 4, [1, 3, 4], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 5 + k.next_access_seq_no = sys.maxsize # Never accessed again. + cache.access(k) + # Evict 4 since its next access MAXINT is the furthest in the future. + assert_metrics( + cache, [3, 5, 5, [1, 3, 5], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 6 + k.next_access_seq_no = 17 + cache.access(k) + # Evict 5 since its next access MAXINT is the furthest in the future. + assert_metrics( + cache, [3, 6, 6, [1, 3, 6], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 7 + k.next_access_seq_no = 16 + cache.access(k) + # Evict 3 since its next access 18 is the furthest in the future. + assert_metrics( + cache, [3, 7, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 1 + k.next_access_seq_no = 25 + cache.access(k) + assert_metrics( + cache, [3, 8, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False + ) + k.access_time += 1 + k.block_id = 8 + k.next_access_seq_no = sys.maxsize + cache.access(k) + # Evict 1 since its next access 25 is the furthest in the future. + assert_metrics( + cache, [3, 9, 8, [6, 7, 8], []], expected_value_size=1, custom_hashtable=False + ) + + # Insert a large kv pair to evict all keys. 
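+    # (block_size is set to 3 below, the full cache capacity, so admitting
+    # this block forces out every resident key.)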
+ k.access_time += 1 + k.block_id = 10 + k.block_size = 3 + k.next_access_seq_no = sys.maxsize + cache.access(k) + assert_metrics( + cache, [3, 10, 9, [10], []], expected_value_size=3, custom_hashtable=False + ) + print("Test OPT cache: Success") + + +def test_trace_cache(): + print("Test trace cache") + cache = TraceCache(0) + k = TraceRecord( + access_time=0, + block_id=1, + block_type=1, + block_size=1, + cf_id=0, + cf_name="", + level=0, + fd=0, + caller=1, + no_insert=0, + get_id=1, + key_id=1, + kv_size=0, + is_hit=1, + referenced_key_exist_in_block=1, + num_keys_in_block=0, + table_id=0, + seq_number=0, + block_key_size=0, + key_size=0, + block_offset_in_file=0, + next_access_seq_no=7, + ) + cache.access(k) + assert cache.miss_ratio_stats.num_accesses == 1 + assert cache.miss_ratio_stats.num_misses == 0 + k.is_hit = 0 + cache.access(k) + assert cache.miss_ratio_stats.num_accesses == 2 + assert cache.miss_ratio_stats.num_misses == 1 + print("Test trace cache: Success") + + if __name__ == "__main__": - policies = [] - policies.append(MRUPolicy()) - policies.append(LRUPolicy()) - policies.append(LFUPolicy()) test_hash_table() - test_lru_cache() + test_trace_cache() + test_opt_cache() + test_lru_cache( + ThompsonSamplingCache( + 3, enable_cache_row_key=0, policies=[LRUPolicy()], cost_class_label=None + ), + custom_hashtable=True, + ) + test_lru_cache(LRUCache(3, enable_cache_row_key=0), custom_hashtable=False) test_mru_cache() test_lfu_cache() - test_mix(ThompsonSamplingCache(100, False, policies)) - test_mix(ThompsonSamplingCache(100, True, policies)) - test_mix(LinUCBCache(100, False, policies)) - test_mix(LinUCBCache(100, True, policies)) - test_hybrid(ThompsonSamplingCache(kSampleSize, True, [LRUPolicy()])) - test_hybrid(LinUCBCache(kSampleSize, True, [LRUPolicy()])) + test_hybrid( + ThompsonSamplingCache( + kSampleSize, + enable_cache_row_key=1, + policies=[LRUPolicy()], + cost_class_label=None, + ) + ) + test_hybrid( + LinUCBCache( + kSampleSize, + enable_cache_row_key=1, + policies=[LRUPolicy()], + cost_class_label=None, + ) + ) + for cache_type in [ + "ts", + "opt", + "arc", + "pylfu", + "pymru", + "trace", + "pyhb", + "lru", + "pylru", + "linucb", + "gdsize", + "pycctbbt", + "pycctb", + "pyccbt", + ]: + for enable_row_cache in [0, 1, 2]: + cache_type_str = cache_type + if cache_type != "opt" and cache_type != "trace": + if enable_row_cache == 1: + cache_type_str += "_hybrid" + elif enable_row_cache == 2: + cache_type_str += "_hybridn" + test_mix(create_cache(cache_type_str, cache_size=100, downsample_size=1)) + test_end_to_end() diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index 032ed2be24f..e1021b466c3 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -127,6 +127,9 @@ DEFINE_string(analyze_get_spatial_locality_labels, "", "Group data blocks using these labels."); DEFINE_string(analyze_get_spatial_locality_buckets, "", "Group data blocks by their statistics using these buckets."); +DEFINE_string(skew_labels, "", + "Group the access count of a block using these labels."); +DEFINE_string(skew_buckets, "", "Group the skew labels using these buckets."); DEFINE_bool(mrc_only, false, "Evaluate alternative cache policies only. 
When this flag is true, "
            "the analyzer does NOT maintain states of each block in memory for "
@@ -147,6 +150,7 @@ namespace {
 const std::string kMissRatioCurveFileName = "mrc";
 const std::string kGroupbyBlock = "block";
+const std::string kGroupbyTable = "table";
 const std::string kGroupbyColumnFamily = "cf";
 const std::string kGroupbySSTFile = "sst";
 const std::string kGroupbyBlockType = "bt";
@@ -164,6 +168,7 @@ const std::string kSupportedCacheNames =
 // The suffix for the generated csv files.
 const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline";
 const std::string kFileNameSuffixMissTimeline = "miss_timeline";
+const std::string kFileNameSuffixSkew = "skewness";
 const std::string kFileNameSuffixAccessTimeline = "access_timeline";
 const std::string kFileNameSuffixCorrelation = "correlation_input";
 const std::string kFileNameSuffixAvgReuseIntervalNaccesses =
@@ -540,6 +545,62 @@ void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const {
   }
 }
 
+void BlockCacheTraceAnalyzer::WriteSkewness(
+    const std::string& label_str, const std::vector<uint64_t>& percent_buckets,
+    TraceType target_block_type) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, uint64_t> label_naccesses;
+  uint64_t total_naccesses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    if (target_block_type != TraceType::kTraceMax &&
+        target_block_type != type) {
+      return;
+    }
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    label_naccesses[label] += block.num_accesses;
+    total_naccesses += block.num_accesses;
+  };
+  TraverseBlocks(block_callback, &labels);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_bucket_naccesses;
+  std::vector<std::pair<std::string, uint64_t>> pairs;
+  for (auto const& itr : label_naccesses) {
+    pairs.push_back(itr);
+  }
+  // Sort in descending order.
+  sort(pairs.begin(), pairs.end(),
+       [=](std::pair<std::string, uint64_t>& a,
           std::pair<std::string, uint64_t>& b) { return b.second < a.second; });
+
+  size_t prev_start_index = 0;
+  for (auto const& percent : percent_buckets) {
+    label_bucket_naccesses[label_str][percent] = 0;
+    size_t end_index = 0;
+    if (percent == port::kMaxUint64) {
+      end_index = label_naccesses.size();
+    } else {
+      end_index = percent * label_naccesses.size() / 100;
+    }
+    for (size_t i = prev_start_index; i < end_index; i++) {
+      label_bucket_naccesses[label_str][percent] += pairs[i].second;
+    }
+    prev_start_index = end_index;
+  }
+  std::string filename_suffix;
+  if (target_block_type != TraceType::kTraceMax) {
+    filename_suffix = block_type_to_string(target_block_type);
+    filename_suffix += "_";
+  }
+  filename_suffix += kFileNameSuffixSkew;
+  WriteStatsToFile(label_str, percent_buckets, filename_suffix,
+                   label_bucket_naccesses, total_naccesses);
+}
+
 void BlockCacheTraceAnalyzer::WriteCorrelationFeatures(
     const std::string& label_str, uint32_t max_number_of_values) const {
   std::set<std::string> labels = ParseLabelStr(label_str);
@@ -549,12 +610,16 @@
       [&](const std::string& cf_name, uint64_t fd, uint32_t level,
           TraceType block_type, const std::string& /*block_key*/,
           uint64_t /*block_key_id*/, const BlockAccessInfo& block) {
+        if (block.table_id == 0 && labels.find(kGroupbyTable) != labels.end()) {
+          // We only know table id information for get requests.
+          return;
+        }
         if (labels.find(kGroupbyCaller) != labels.end()) {
          // Group by caller.
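          // (Each (label, caller) pair gets its own feature vector so that
          // access patterns from different callers stay separable.)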
          for (auto const& caller_map : block.caller_access_timeline) {
            const std::string label =
                BuildLabel(labels, cf_name, fd, level, block_type,
-                          caller_map.first, /*block_id=*/0);
+                          caller_map.first, /*block_id=*/0, block);
            auto it = block.caller_access_sequence__number_timeline.find(
                caller_map.first);
            assert(it != block.caller_access_sequence__number_timeline.end());
@@ -563,14 +628,15 @@
          }
          return;
        }
-        const std::string label = BuildLabel(
-            labels, cf_name, fd, level, block_type,
-            TableReaderCaller::kMaxBlockCacheLookupCaller, /*block_id=*/0);
+        const std::string label =
+            BuildLabel(labels, cf_name, fd, level, block_type,
+                       TableReaderCaller::kMaxBlockCacheLookupCaller,
+                       /*block_id=*/0, block);
         UpdateFeatureVectors(block.access_sequence_number_timeline,
                              block.access_timeline, label, &label_features,
                              &label_predictions);
       };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions,
                                  max_number_of_values);
 }
@@ -656,7 +722,7 @@ std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr(
 std::string BlockCacheTraceAnalyzer::BuildLabel(
     const std::set<std::string>& labels, const std::string& cf_name,
     uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller,
-    uint64_t block_key) const {
+    uint64_t block_key, const BlockAccessInfo& block) const {
   std::map<std::string, std::string> label_value_map;
   label_value_map[kGroupbyAll] = kGroupbyAll;
   label_value_map[kGroupbyLevel] = std::to_string(level);
@@ -665,6 +731,7 @@ std::string BlockCacheTraceAnalyzer::BuildLabel(
   label_value_map[kGroupbyBlockType] = block_type_to_string(type);
   label_value_map[kGroupbyColumnFamily] = cf_name;
   label_value_map[kGroupbyBlock] = std::to_string(block_key);
+  label_value_map[kGroupbyTable] = std::to_string(block.table_id);
   // Concatenate the label values.
   std::string label;
   for (auto const& l : labels) {
@@ -683,7 +750,8 @@ void BlockCacheTraceAnalyzer::TraverseBlocks(
     std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
                        uint32_t /*level*/, TraceType /*block_type*/,
                        const std::string& /*block_key*/,
                        uint64_t /*block_key_id*/,
                        const BlockAccessInfo& /*block_access_info*/)>
-        block_callback) const {
+        block_callback,
+    std::set<std::string>* labels) const {
   for (auto const& cf_aggregates : cf_aggregates_map_) {
     // Stats per column family.
     const std::string& cf_name = cf_aggregates.first;
@@ -698,6 +766,11 @@ void BlockCacheTraceAnalyzer::TraverseBlocks(
         for (auto const& block_access_info :
              block_type_aggregates.second.block_access_info_map) {
           // Stats per block.
+          if (labels && block_access_info.second.table_id == 0 &&
+              labels->find(kGroupbyTable) != labels->end()) {
+            // We only know table id information for get requests.
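+            // (Blocks never referenced by a Get keep the default table_id of
+            // 0, so they cannot be grouped by table.)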
+            continue;
+          }
           block_callback(cf_name, fd, level, type, block_access_info.first,
                          block_access_info.second.block_id,
                          block_access_info.second);
@@ -733,7 +806,7 @@ void BlockCacheTraceAnalyzer::WriteGetSpatialLocality(
         }
         const std::string label =
             BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock,
-                       TableReaderCaller::kUserGet, /*block_id=*/0);
+                       TableReaderCaller::kUserGet, /*block_id=*/0, block);
         const uint64_t percent_referenced_for_existing_keys =
             static_cast<uint64_t>(std::max(
@@ -761,7 +834,7 @@ void BlockCacheTraceAnalyzer::WriteGetSpatialLocality(
             ->second += 1;
         nblocks += 1;
       };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys,
                    label_pnrefkeys_nblocks, nblocks);
   WriteStatsToFile(label_str, percent_buckets,
@@ -792,7 +865,7 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str,
           continue;
         }
         const std::string label =
-            BuildLabel(labels, cf_name, fd, level, type, caller, block_id);
+            BuildLabel(labels, cf_name, fd, level, type, caller, block_id, block);
         for (auto const& naccess : timeline.second) {
           const uint64_t timestamp = naccess.first / time_unit;
           const uint64_t num = naccess.second;
@@ -806,7 +879,7 @@ void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str,
           access_count_block_id_map[naccesses].push_back(std::to_string(block_id));
         }
       };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   // We have label_access_timeline now. Write them into a file.
   const std::string user_access_prefix =
@@ -877,9 +950,9 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance(
                             uint32_t level, TraceType type,
                             const std::string& /*block_key*/, uint64_t block_id,
                             const BlockAccessInfo& block) {
-    const std::string label =
-        BuildLabel(labels, cf_name, fd, level, type,
-                   TableReaderCaller::kMaxBlockCacheLookupCaller, block_id);
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
     if (label_distance_num_reuses.find(label) ==
         label_distance_num_reuses.end()) {
       // The first time we encounter this label.
@@ -894,7 +967,7 @@ void BlockCacheTraceAnalyzer::WriteReuseDistance(
       total_num_reuses += reuse_distance.second;
     }
   };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   // We have label_naccesses and label_distance_num_reuses now. Write them into
   // a file.
   const std::string output_path =
@@ -1016,17 +1089,17 @@ void BlockCacheTraceAnalyzer::WriteReuseInterval(
     if (labels.find(kGroupbyCaller) != labels.end()) {
       for (auto const& timeline : block.caller_num_accesses_timeline) {
         const TableReaderCaller caller = timeline.first;
-        const std::string label =
-            BuildLabel(labels, cf_name, fd, level, type, caller, block_id);
+        const std::string label = BuildLabel(labels, cf_name, fd, level, type,
+                                             caller, block_id, block);
         UpdateReuseIntervalStats(label, time_buckets, timeline.second,
                                  &label_time_num_reuses, &total_num_reuses);
       }
       return;
     }
     // Does not group by caller so we need to flatten the access timeline.
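    // (The per-caller access counts are summed into a single timeline keyed
    // by timestamp.)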
-    const std::string label =
-        BuildLabel(labels, cf_name, fd, level, type,
-                   TableReaderCaller::kMaxBlockCacheLookupCaller, block_id);
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
     std::map<uint64_t, uint64_t> timeline;
     for (auto const& caller_timeline : block.caller_num_accesses_timeline) {
       for (auto const& time_naccess : caller_timeline.second) {
@@ -1045,7 +1118,7 @@
     label_avg_reuse_naccesses[label].upper_bound(avg_reuse_interval)->second +=
         block.num_accesses;
   };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   // Write the stats into files.
   WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval,
@@ -1074,9 +1147,9 @@ void BlockCacheTraceAnalyzer::WriteReuseLifetime(
     } else {
       lifetime = port::kMaxUint64 - 1;
     }
-    const std::string label =
-        BuildLabel(labels, cf_name, fd, level, type,
-                   TableReaderCaller::kMaxBlockCacheLookupCaller, block_id);
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
     if (label_lifetime_nblocks.find(label) == label_lifetime_nblocks.end()) {
       // The first time we encounter this label.
@@ -1087,7 +1160,7 @@
     label_lifetime_nblocks[label].upper_bound(lifetime)->second += 1;
     total_nblocks += 1;
   };
-  TraverseBlocks(block_callback);
+  TraverseBlocks(block_callback, &labels);
   WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime,
                    label_lifetime_nblocks, total_nblocks);
 }
@@ -1396,11 +1469,17 @@ Status BlockCacheTraceAnalyzer::WriteHumanReadableTraceRecord(
   int ret = snprintf(
       trace_record_buffer_, sizeof(trace_record_buffer_),
       "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%s,%" PRIu32
-      ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u\n",
+      ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u,%u,%" PRIu64
+      ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",
      access.access_timestamp, block_id, access.block_type, access.block_size,
      access.cf_id, access.cf_name.c_str(), access.level, access.sst_fd_number,
      access.caller, access.no_insert, access.get_id, get_key_id,
-      access.referenced_data_size, access.is_cache_hit);
+      access.referenced_data_size, access.is_cache_hit,
+      access.referenced_key_exist_in_block, access.num_keys_in_block,
+      BlockCacheTraceHelper::GetTableId(access),
+      BlockCacheTraceHelper::GetSequenceNumber(access), access.block_key.size(),
+      access.referenced_key.size(),
+      BlockCacheTraceHelper::GetBlockOffsetInFile(access));
   if (ret < 0) {
     return Status::IOError("failed to format the output");
   }
@@ -1432,13 +1511,13 @@ Status BlockCacheTraceAnalyzer::RecordAccess(
   uint64_t get_key_id = 0;
   if (access.caller == TableReaderCaller::kUserGet &&
      access.get_id != BlockCacheTraceHelper::kReservedGetId) {
-    std::string row_key = BlockCacheTraceHelper::ComputeRowKey(access);
-    if (get_key_info_map_.find(row_key) == get_key_info_map_.end()) {
-      get_key_info_map_[row_key].key_id = unique_get_key_id_;
-      get_key_id = unique_get_key_id_;
+    std::string user_key = ExtractUserKey(access.referenced_key).ToString();
+    if (get_key_info_map_.find(user_key) == get_key_info_map_.end()) {
+      get_key_info_map_[user_key].key_id = unique_get_key_id_;
+      unique_get_key_id_++;
     }
-    get_key_info_map_[row_key].AddAccess(access, access_sequence_number_);
+    get_key_id = get_key_info_map_[user_key].key_id;
+    get_key_info_map_[user_key].AddAccess(access, access_sequence_number_);
   }
   if (compute_reuse_distance_) {
@@ -2224,6 +2303,25 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) {
     analyzer.WriteCorrelationFeaturesForGet(
         FLAGS_analyze_correlation_coefficients_max_number_of_values);
   }
+
+  if (!FLAGS_skew_labels.empty() && !FLAGS_skew_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_skew_buckets);
+    std::stringstream ss(FLAGS_skew_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      if (label.find("block") != std::string::npos) {
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceIndexBlock);
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceFilterBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kBlockTraceDataBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      } else {
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      }
+    }
+  }
   return 0;
 }
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h
index bc41ff468cc..f22a9da68f3 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer.h
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.h
@@ -33,6 +33,8 @@ struct GetKeyInfo {
 // Statistics of a block.
 struct BlockAccessInfo {
   uint64_t block_id = 0;
+  uint64_t table_id = 0;
+  uint64_t block_offset = 0;
   uint64_t num_accesses = 0;
   uint64_t block_size = 0;
   uint64_t first_access_time = 0;
@@ -73,6 +75,8 @@ struct BlockAccessInfo {
     if (first_access_time == 0) {
       first_access_time = access.access_timestamp;
     }
+    table_id = BlockCacheTraceHelper::GetTableId(access);
+    block_offset = BlockCacheTraceHelper::GetBlockOffsetInFile(access);
     last_access_time = access.access_timestamp;
     block_size = access.block_size;
     caller_num_access_map[access.caller]++;
@@ -301,6 +305,10 @@ class BlockCacheTraceAnalyzer {
   void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const;
 
+  void WriteSkewness(const std::string& label_str,
+                     const std::vector<uint64_t>& percent_buckets,
+                     TraceType target_block_type) const;
+
   const std::map<std::string, ColumnFamilyAccessInfoAggregate>&
   TEST_cf_aggregates_map() const {
     return cf_aggregates_map_;
@@ -312,7 +320,8 @@ class BlockCacheTraceAnalyzer {
   std::string BuildLabel(const std::set<std::string>& labels,
                          const std::string& cf_name, uint64_t fd,
                          uint32_t level, TraceType type,
-                         TableReaderCaller caller, uint64_t block_key) const;
+                         TableReaderCaller caller, uint64_t block_key,
+                         const BlockAccessInfo& block) const;
 
   void ComputeReuseDistance(BlockAccessInfo* info) const;
 
@@ -341,7 +350,8 @@ class BlockCacheTraceAnalyzer {
      std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
                         uint32_t /*level*/, TraceType /*block_type*/,
                         const std::string& /*block_key*/,
                         uint64_t /*block_key_id*/,
                         const BlockAccessInfo& /*block_access_info*/)>
-          block_callback) const;
+          block_callback,
+      std::set<std::string>* labels = nullptr) const;
 
   void UpdateFeatureVectors(
       const std::vector<uint64_t>& access_sequence_number_timeline,
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
index 9917d5b9e78..eecd6e80d9d 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -181,7 +181,9 @@ class BlockCacheTracerTest : public testing::Test {
         analyze_get_spatial_locality_labels_,
         "-analyze_get_spatial_locality_buckets=" +
             analyze_get_spatial_locality_buckets_,
-        "-analyze_correlation_coefficients_labels=all"};
+        "-analyze_correlation_coefficients_labels=all",
+        "-skew_labels=all",
"-skew_buckets=10,50,100"}; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -331,6 +333,33 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } } } + { + // Validate the skewness csv file. + const std::string skewness_file_path = test_path_ + "/all_skewness"; + std::ifstream skew_file(skewness_file_path); + // Read header. + std::string line; + ASSERT_TRUE(getline(skew_file, line)); + std::stringstream ss(line); + double sum_percent = 0; + while (getline(skew_file, line)) { + std::stringstream ss_naccess(line); + std::string substr; + bool read_label = false; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (!read_label) { + read_label = true; + continue; + } + sum_percent += ParseDouble(substr); + } + } + ASSERT_EQ(100.0, sum_percent); + ASSERT_FALSE(getline(skew_file, line)); + skew_file.close(); + ASSERT_OK(env_->DeleteFile(skewness_file_path)); + } { // Validate the timeline csv files. const std::vector time_units{"_60", "_3600"}; diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index 1eeb64ac85d..4f39be609fe 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -61,11 +61,40 @@ std::string BlockCacheTraceHelper::ComputeRowKey( return ""; } Slice key = ExtractUserKey(access.referenced_key); - uint64_t seq_no = access.get_from_user_specified_snapshot == Boolean::kFalse - ? 0 - : 1 + GetInternalKeySeqno(access.referenced_key); - return std::to_string(access.sst_fd_number) + "_" + key.ToString() + "_" + - std::to_string(seq_no); + return std::to_string(access.sst_fd_number) + "_" + key.ToString(); +} + +uint64_t BlockCacheTraceHelper::GetTableId( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller) || access.referenced_key.size() < 4) { + return 0; + } + return static_cast(DecodeFixed32(access.referenced_key.data())) + 1; +} + +uint64_t BlockCacheTraceHelper::GetSequenceNumber( + const BlockCacheTraceRecord& access) { + if (!IsGetOrMultiGet(access.caller)) { + return 0; + } + return access.get_from_user_specified_snapshot == Boolean::kFalse + ? 0 + : 1 + GetInternalKeySeqno(access.referenced_key); +} + +uint64_t BlockCacheTraceHelper::GetBlockOffsetInFile( + const BlockCacheTraceRecord& access) { + Slice input(access.block_key); + uint64_t offset = 0; + while (true) { + uint64_t tmp = 0; + if (GetVarint64(&input, &tmp)) { + offset = tmp; + } else { + break; + } + } + return offset; } BlockCacheTraceWriter::BlockCacheTraceWriter( diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 3863ca430a4..b109b1db01c 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -31,6 +31,15 @@ class BlockCacheTraceHelper { // Row key is a concatenation of the access's fd_number and the referenced // user key. static std::string ComputeRowKey(const BlockCacheTraceRecord& access); + // The first four bytes of the referenced key in a Get request is the table + // id. + static uint64_t GetTableId(const BlockCacheTraceRecord& access); + // The sequence number of a get request is the last part of the referenced + // key. + static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access); + // Block offset in a file is the last varint64 in the block key. 
+ static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access); + static const std::string kUnknownColumnFamilyName; static const uint64_t kReservedGetId; }; diff --git a/utilities/simulator_cache/cache_simulator_test.cc b/utilities/simulator_cache/cache_simulator_test.cc index babdd431f5a..3d3432e20a4 100644 --- a/utilities/simulator_cache/cache_simulator_test.cc +++ b/utilities/simulator_cache/cache_simulator_test.cc @@ -84,7 +84,7 @@ class CacheSimulatorTest : public testing::Test { for (auto const& key : keys) { std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber; auto handle = - sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString() + "_0"); + sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString()); EXPECT_NE(nullptr, handle); sim_cache->Release(handle); } @@ -229,10 +229,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio()); ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses()); ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio()); - auto handle = sim_cache->Lookup( - std::to_string(first_get.sst_fd_number) + "_" + - ExtractUserKey(first_get.referenced_key).ToString() + "_" + - std::to_string(1 + GetInternalKeySeqno(first_get.referenced_key))); + auto handle = + sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString()); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -256,10 +255,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) { ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses()); ASSERT_EQ(66, static_cast( cache_simulator->miss_ratio_stats().user_miss_ratio())); - handle = sim_cache->Lookup( - std::to_string(second_get.sst_fd_number) + "_" + - ExtractUserKey(second_get.referenced_key).ToString() + "_" + - std::to_string(1 + GetInternalKeySeqno(second_get.referenced_key))); + handle = + sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" + + ExtractUserKey(second_get.referenced_key).ToString()); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); for (uint32_t i = 100; i < block_id; i++) { @@ -394,7 +392,7 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4, {"1", "2", "3", "5"}, {"1", "2", "4"}); for (auto const& key : {"1", "2", "4"}) { - auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); } @@ -417,7 +415,7 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) { AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {}, {}); for (auto const& key : {"1", "2", "4"}) { - auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0"); + auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key); ASSERT_EQ(nullptr, handle); } } @@ -437,9 +435,9 @@ TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) { cache_simulator->Access(first_get); block_id++; } - auto handle = sim_cache->Lookup( - std::to_string(first_get.sst_fd_number) + "_" + - ExtractUserKey(first_get.referenced_key).ToString() + "_0"); + auto handle = + sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" + + ExtractUserKey(first_get.referenced_key).ToString()); ASSERT_NE(nullptr, handle); sim_cache->Release(handle); // All blocks are missing from the 
cache since insert_blocks_row_kvpair_misses From 38b03c840e0ac49cffc7f5a667c6bc910648b9a1 Mon Sep 17 00:00:00 2001 From: Aaryaman Sagar Date: Wed, 7 Aug 2019 14:29:35 -0700 Subject: [PATCH 285/572] Port folly/synchronization/DistributedMutex to rocksdb (#5642) Summary: This ports `folly::DistributedMutex` into RocksDB. The PR includes everything else needed to compile and use DistributedMutex as a component within folly. Most files are unchanged except for some portability stuff and includes. For now, I've put this under `rocksdb/third-party`, but if there is a better folder to put this under, let me know. I also am not sure how or where to put unit tests for third-party stuff like this. It seems like gtest is included already, but I need to link with it from another third-party folder. This also includes some other common components from folly - folly/Optional - folly/ScopeGuard (In particular `SCOPE_EXIT`) - folly/synchronization/ParkingLot (A portable futex-like interface) - folly/synchronization/AtomicNotification (The standard C++ interface for futexes) - folly/Indestructible (For singletons that don't get destroyed without allocations) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5642 Differential Revision: D16544439 fbshipit-source-id: 179b98b5dcddc3075926d31a30f92fd064245731 --- CMakeLists.txt | 23 + Makefile | 30 +- build_tools/build_detect_platform | 6 + build_tools/fbcode_config.sh | 2 + build_tools/fbcode_config_platform007.sh | 2 + src.mk | 7 + third-party/folly/folly/CPortability.h | 15 + third-party/folly/folly/ConstexprMath.h | 17 + third-party/folly/folly/Indestructible.h | 166 ++ third-party/folly/folly/Optional.h | 570 ++++++ third-party/folly/folly/Portability.h | 74 + third-party/folly/folly/ScopeGuard.h | 54 + third-party/folly/folly/Traits.h | 152 ++ third-party/folly/folly/Unit.h | 59 + third-party/folly/folly/Utility.h | 141 ++ third-party/folly/folly/chrono/Hardware.h | 33 + third-party/folly/folly/container/Array.h | 74 + third-party/folly/folly/detail/Futex-inl.h | 117 ++ third-party/folly/folly/detail/Futex.cpp | 263 +++ third-party/folly/folly/detail/Futex.h | 96 + third-party/folly/folly/functional/Invoke.h | 40 + third-party/folly/folly/hash/Hash.h | 29 + third-party/folly/folly/lang/Align.h | 38 + third-party/folly/folly/lang/Bits.h | 30 + third-party/folly/folly/lang/Launder.h | 51 + third-party/folly/folly/portability/Asm.h | 28 + .../folly/folly/portability/SysSyscall.h | 10 + .../folly/folly/portability/SysTypes.h | 26 + .../synchronization/AtomicNotification-inl.h | 138 ++ .../synchronization/AtomicNotification.cpp | 23 + .../synchronization/AtomicNotification.h | 57 + .../folly/synchronization/AtomicUtil-inl.h | 258 +++ .../folly/folly/synchronization/AtomicUtil.h | 52 + .../folly/folly/synchronization/Baton.h | 327 ++++ .../synchronization/DistributedMutex-inl.h | 1702 +++++++++++++++++ .../synchronization/DistributedMutex.cpp | 16 + .../folly/synchronization/DistributedMutex.h | 304 +++ .../DistributedMutexSpecializations.h | 39 + .../folly/synchronization/ParkingLot.cpp | 26 + .../folly/folly/synchronization/ParkingLot.h | 318 +++ .../folly/synchronization/WaitOptions.cpp | 12 + .../folly/folly/synchronization/WaitOptions.h | 57 + .../detail/InlineFunctionRef.h | 219 +++ .../detail/ProxyLockable-inl.h | 207 ++ .../synchronization/detail/ProxyLockable.h | 164 ++ .../folly/synchronization/detail/Sleeper.h | 57 + .../folly/folly/synchronization/detail/Spin.h | 77 + .../test/DistributedMutexTest.cpp | 1130 +++++++++++ 48 files 
changed, 7335 insertions(+), 1 deletion(-) create mode 100644 third-party/folly/folly/CPortability.h create mode 100644 third-party/folly/folly/ConstexprMath.h create mode 100644 third-party/folly/folly/Indestructible.h create mode 100644 third-party/folly/folly/Optional.h create mode 100644 third-party/folly/folly/Portability.h create mode 100644 third-party/folly/folly/ScopeGuard.h create mode 100644 third-party/folly/folly/Traits.h create mode 100644 third-party/folly/folly/Unit.h create mode 100644 third-party/folly/folly/Utility.h create mode 100644 third-party/folly/folly/chrono/Hardware.h create mode 100644 third-party/folly/folly/container/Array.h create mode 100644 third-party/folly/folly/detail/Futex-inl.h create mode 100644 third-party/folly/folly/detail/Futex.cpp create mode 100644 third-party/folly/folly/detail/Futex.h create mode 100644 third-party/folly/folly/functional/Invoke.h create mode 100644 third-party/folly/folly/hash/Hash.h create mode 100644 third-party/folly/folly/lang/Align.h create mode 100644 third-party/folly/folly/lang/Bits.h create mode 100644 third-party/folly/folly/lang/Launder.h create mode 100644 third-party/folly/folly/portability/Asm.h create mode 100644 third-party/folly/folly/portability/SysSyscall.h create mode 100644 third-party/folly/folly/portability/SysTypes.h create mode 100644 third-party/folly/folly/synchronization/AtomicNotification-inl.h create mode 100644 third-party/folly/folly/synchronization/AtomicNotification.cpp create mode 100644 third-party/folly/folly/synchronization/AtomicNotification.h create mode 100644 third-party/folly/folly/synchronization/AtomicUtil-inl.h create mode 100644 third-party/folly/folly/synchronization/AtomicUtil.h create mode 100644 third-party/folly/folly/synchronization/Baton.h create mode 100644 third-party/folly/folly/synchronization/DistributedMutex-inl.h create mode 100644 third-party/folly/folly/synchronization/DistributedMutex.cpp create mode 100644 third-party/folly/folly/synchronization/DistributedMutex.h create mode 100644 third-party/folly/folly/synchronization/DistributedMutexSpecializations.h create mode 100644 third-party/folly/folly/synchronization/ParkingLot.cpp create mode 100644 third-party/folly/folly/synchronization/ParkingLot.h create mode 100644 third-party/folly/folly/synchronization/WaitOptions.cpp create mode 100644 third-party/folly/folly/synchronization/WaitOptions.h create mode 100644 third-party/folly/folly/synchronization/detail/InlineFunctionRef.h create mode 100644 third-party/folly/folly/synchronization/detail/ProxyLockable-inl.h create mode 100644 third-party/folly/folly/synchronization/detail/ProxyLockable.h create mode 100644 third-party/folly/folly/synchronization/detail/Sleeper.h create mode 100644 third-party/folly/folly/synchronization/detail/Spin.h create mode 100644 third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8622242aa75..f81e0ca4f99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,13 @@ option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() +# third-party/folly is only validated to work on Linux and Windows for now. +# So only turn it on there by default. 
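+# (It can still be enabled manually on other platforms by passing
+# -DWITH_FOLLY_DISTRIBUTED_MUTEX=ON.)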
+if(CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Windows") + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" ON) +else() + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) +endif() if(MSVC) # Defaults currently different for GFLAGS. # We will address find_package work a little later @@ -462,6 +469,9 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) +endif() find_package(Threads REQUIRED) # Main library source code @@ -738,6 +748,15 @@ else() env/io_posix.cc) endif() +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND SOURCES + third-party/folly/folly/detail/Futex.cpp + third-party/folly/folly/synchronization/AtomicNotification.cpp + third-party/folly/folly/synchronization/DistributedMutex.cpp + third-party/folly/folly/synchronization/ParkingLot.cpp + third-party/folly/folly/synchronization/WaitOptions.cpp) +endif() + set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX}) set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) set(ROCKSDB_IMPORT_LIB ${ROCKSDB_SHARED_LIB}) @@ -1009,6 +1028,10 @@ if(WITH_TESTS) list(APPEND TESTS utilities/env_librados_test.cc) endif() + if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND TESTS third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp) + endif() + set(BENCHMARKS cache/cache_bench.cc memtable/memtablerep_bench.cc diff --git a/Makefile b/Makefile index 1718309cb89..ccca3ac5efb 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 + DEBUG_LEVEL=0 endif endif @@ -304,6 +304,10 @@ ifndef DISABLE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) endif +ifndef USE_FOLLY_DISTRIBUTED_MUTEX + USE_FOLLY_DISTRIBUTED_MUTEX=0 +endif + export GTEST_THROW_ON_FAILURE=1 export GTEST_HAS_EXCEPTIONS=1 GTEST_DIR = ./third-party/gtest-1.7.0/fused-src @@ -316,6 +320,18 @@ else PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR) endif +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLY_DIR = ./third-party/folly + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(FOLLY_DIR) + PLATFORM_CXXFLAGS += -I$(FOLLY_DIR) + else + PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR) + PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR) + endif +endif + # This (the first rule) must depend on "all". 
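# (Editor's note, illustrative only and not part of the original patch: with
# the USE_FOLLY_DISTRIBUTED_MUTEX plumbing added above, one plausible way to
# exercise the port is
#   USE_FOLLY_DISTRIBUTED_MUTEX=1 make folly_synchronization_distributed_mutex_test
# using the test target introduced elsewhere in this Makefile diff.)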
default: all @@ -402,6 +418,9 @@ endif LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLYOBJECTS = $(FOLLY_SOURCES:.cpp=.o) +endif GTEST = $(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = ./test_util/testutil.o @@ -569,6 +588,10 @@ TESTS = \ block_cache_tracer_test \ block_cache_trace_analyzer_test \ +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + TESTS += folly_synchronization_distributed_mutex_test +endif + PARALLEL_TEST = \ backupable_db_test \ db_bloom_filter_test \ @@ -1120,6 +1143,11 @@ trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) +folly_synchronization_distributed_mutex_test: $(LIBOBJECTS) $(TESTHARNESS) $(FOLLYOBJECTS) third-party/folly/folly/synchronization/test/DistributedMutexTest.o + $(AM_LINK) +endif + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 4a52c6cddb7..7b18a5d5f59 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -150,6 +150,9 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + if test -z "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + USE_FOLLY_DISTRIBUTED_MUTEX=1 + fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -661,3 +664,6 @@ if test -n "$WITH_JEMALLOC_FLAG"; then echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" fi echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" +if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" +fi diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 4415f87da38..c2c39db48fe 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -159,4 +159,6 @@ else LUA_LIB=" $LUA_PATH/lib/liblua_pic.a" fi +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/fbcode_config_platform007.sh b/build_tools/fbcode_config_platform007.sh index 1a1e4208139..9da23fd843f 100644 --- a/build_tools/fbcode_config_platform007.sh +++ b/build_tools/fbcode_config_platform007.sh @@ -155,4 +155,6 @@ VALGRIND_VER="$VALGRIND_BASE/bin/" LUA_PATH= LUA_LIB= +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/src.mk b/src.mk index 6d1d655c7f0..8ebc0bee96c 100644 --- a/src.mk +++ b/src.mk @@ -263,6 +263,13 @@ TEST_LIB_SOURCES = \ test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ +FOLLY_SOURCES = \ + third-party/folly/folly/detail/Futex.cpp \ + third-party/folly/folly/synchronization/AtomicNotification.cpp \ + third-party/folly/folly/synchronization/DistributedMutex.cpp \ + third-party/folly/folly/synchronization/ParkingLot.cpp \ + third-party/folly/folly/synchronization/WaitOptions.cpp \ + MAIN_SOURCES = \ cache/cache_bench.cc \ cache/cache_test.cc \ diff --git a/third-party/folly/folly/CPortability.h b/third-party/folly/folly/CPortability.h new file mode 100644 index 00000000000..3ce3a7785ac --- /dev/null +++ b/third-party/folly/folly/CPortability.h 
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+/**
+ * Macro for marking functions as having public visibility.
+ */
+#if defined(__GNUC__)
+#define FOLLY_EXPORT __attribute__((__visibility__("default")))
+#else
+#define FOLLY_EXPORT
+#endif
diff --git a/third-party/folly/folly/ConstexprMath.h b/third-party/folly/folly/ConstexprMath.h
new file mode 100644
index 00000000000..b125c5f423b
--- /dev/null
+++ b/third-party/folly/folly/ConstexprMath.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace folly {
+template <typename T>
+constexpr T constexpr_max(T a) {
+  return a;
+}
+template <typename T, typename... Ts>
+constexpr T constexpr_max(T a, T b, Ts... ts) {
+  return b < a ? constexpr_max(a, ts...) : constexpr_max(b, ts...);
+}
+} // namespace folly
diff --git a/third-party/folly/folly/Indestructible.h b/third-party/folly/folly/Indestructible.h
new file mode 100644
index 00000000000..68249d86512
--- /dev/null
+++ b/third-party/folly/folly/Indestructible.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+namespace folly {
+
+/***
+ * Indestructible
+ *
+ * When you need a Meyers singleton that will not get destructed, even at
+ * shutdown, and you also want the object stored inline.
+ *
+ * Use like:
+ *
+ *   void doSomethingWithExpensiveData();
+ *
+ *   void doSomethingWithExpensiveData() {
+ *     static const Indestructible<map<string, int>> data{
+ *       map<string, int>{{"key1", 17}, {"key2", 19}, {"key3", 23}},
+ *     };
+ *     callSomethingTakingAMapByRef(*data);
+ *   }
+ *
+ * This should be used only for Meyers singletons, and, even then, only when
+ * the instance does not need to be destructed ever.
+ *
+ * This should not be used more generally, e.g., as member fields, etc.
+ *
+ * This is designed as an alternative, but with one fewer allocation at
+ * construction time and one fewer pointer dereference at access time, to the
+ * Meyers singleton pattern of:
+ *
+ *   void doSomethingWithExpensiveData() {
+ *     static const auto data = // never `delete`d
+ *         new map<string, int>{{"key1", 17}, {"key2", 19}, {"key3", 23}};
+ *     callSomethingTakingAMapByRef(*data);
+ *   }
+ */
+
+template <typename T>
+class Indestructible final {
+ public:
+  template
+  constexpr Indestructible() noexcept(noexcept(T())) {}
+
+  /**
+   * Constructor accepting a single argument by forwarding reference, this
+   * allows using list initialization without the overhead of things like
+   * in_place, etc and also works with std::initializer_list constructors
+   * which can't be deduced, the default parameter helps there.
+   *
+   *   auto i = folly::Indestructible<std::map<int, int>>{{{1, 2}}};
+   *
+   * This provides convenience
+   *
+   * There are two versions of this constructor - one for when the element is
+   * implicitly constructible from the given argument and one for when the
+   * type is explicitly but not implicitly constructible from the given
+   * argument.
+ */ + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + explicit constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + /* implicit */ constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + + template ()...))> + explicit constexpr Indestructible(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : storage_(std::forward(args)...) {} + template < + typename U, + typename... Args, + typename = decltype( + T(std::declval&>(), + std::declval()...))> + explicit constexpr Indestructible(std::initializer_list il, Args... args) noexcept( + noexcept( + T(std::declval&>(), + std::declval()...))) + : storage_(il, std::forward(args)...) {} + + ~Indestructible() = default; + + Indestructible(Indestructible const&) = delete; + Indestructible& operator=(Indestructible const&) = delete; + + Indestructible(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) + : storage_(std::move(other.storage_.value)) { + other.erased_ = true; + } + Indestructible& operator=(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) { + storage_.value = std::move(other.storage_.value); + other.erased_ = true; + } + + T* get() noexcept { + check(); + return &storage_.value; + } + T const* get() const noexcept { + check(); + return &storage_.value; + } + T& operator*() noexcept { + return *get(); + } + T const& operator*() const noexcept { + return *get(); + } + T* operator->() noexcept { + return get(); + } + T const* operator->() const noexcept { + return get(); + } + + private: + void check() const noexcept { + assert(!erased_); + } + + union Storage { + T value; + + template + constexpr Storage() noexcept(noexcept(T())) : value() {} + + template ()...))> + explicit constexpr Storage(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : value(std::forward(args)...) {} + + ~Storage() {} + }; + + Storage storage_{}; + bool erased_{false}; +}; +} // namespace folly diff --git a/third-party/folly/folly/Optional.h b/third-party/folly/folly/Optional.h new file mode 100644 index 00000000000..ee12467dda7 --- /dev/null +++ b/third-party/folly/folly/Optional.h @@ -0,0 +1,570 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +/* + * Optional - For conditional initialization of values, like boost::optional, + * but with support for move semantics and emplacement. Reference type support + * has not been included due to limited use cases and potential confusion with + * semantics of assignment: Assigning to an optional reference could quite + * reasonably copy its value or redirect the reference. 
+ * + * Optional can be useful when a variable might or might not be needed: + * + * Optional maybeLogger = ...; + * if (maybeLogger) { + * maybeLogger->log("hello"); + * } + * + * Optional enables a 'null' value for types which do not otherwise have + * nullability, especially useful for parameter passing: + * + * void testIterator(const unique_ptr& it, + * initializer_list idsExpected, + * Optional> ranksExpected = none) { + * for (int i = 0; it->next(); ++i) { + * EXPECT_EQ(it->doc().id(), idsExpected[i]); + * if (ranksExpected) { + * EXPECT_EQ(it->doc().rank(), (*ranksExpected)[i]); + * } + * } + * } + * + * Optional models OptionalPointee, so calling 'get_pointer(opt)' will return a + * pointer to nullptr if the 'opt' is empty, and a pointer to the value if it is + * not: + * + * Optional maybeInt = ...; + * if (int* v = get_pointer(maybeInt)) { + * cout << *v << endl; + * } + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace folly { + +template +class Optional; + +namespace detail { +template +struct OptionalPromiseReturn; +} // namespace detail + +struct None { + enum class _secret { _token }; + + /** + * No default constructor to support both `op = {}` and `op = none` + * as syntax for clearing an Optional, just like std::nullopt_t. + */ + constexpr explicit None(_secret) {} +}; +constexpr None none{None::_secret::_token}; + +class FOLLY_EXPORT OptionalEmptyException : public std::runtime_error { + public: + OptionalEmptyException() + : std::runtime_error("Empty Optional cannot be unwrapped") {} +}; + +template +class Optional { + public: + typedef Value value_type; + + static_assert( + !std::is_reference::value, + "Optional may not be used with reference types"); + static_assert( + !std::is_abstract::value, + "Optional may not be used with abstract types"); + + Optional() noexcept {} + + Optional(const Optional& src) noexcept( + std::is_nothrow_copy_constructible::value) { + if (src.hasValue()) { + construct(src.value()); + } + } + + Optional(Optional&& src) noexcept( + std::is_nothrow_move_constructible::value) { + if (src.hasValue()) { + construct(std::move(src.value())); + src.clear(); + } + } + + /* implicit */ Optional(const None&) noexcept {} + + /* implicit */ Optional(Value&& newValue) noexcept( + std::is_nothrow_move_constructible::value) { + construct(std::move(newValue)); + } + + /* implicit */ Optional(const Value& newValue) noexcept( + std::is_nothrow_copy_constructible::value) { + construct(newValue); + } + + template + explicit Optional(in_place_t, Args&&... args) noexcept( + std::is_nothrow_constructible::value) + : Optional{PrivateConstructor{}, std::forward(args)...} {} + + template + explicit Optional( + in_place_t, + std::initializer_list il, + Args&&... 
args) noexcept(std:: + is_nothrow_constructible< + Value, + std::initializer_list, + Args...>::value) + : Optional{PrivateConstructor{}, il, std::forward(args)...} {} + + // Used only when an Optional is used with coroutines on MSVC + /* implicit */ Optional(const detail::OptionalPromiseReturn& p) + : Optional{} { + p.promise_->value_ = this; + } + + void assign(const None&) { + clear(); + } + + void assign(Optional&& src) { + if (this != &src) { + if (src.hasValue()) { + assign(std::move(src.value())); + src.clear(); + } else { + clear(); + } + } + } + + void assign(const Optional& src) { + if (src.hasValue()) { + assign(src.value()); + } else { + clear(); + } + } + + void assign(Value&& newValue) { + if (hasValue()) { + storage_.value = std::move(newValue); + } else { + construct(std::move(newValue)); + } + } + + void assign(const Value& newValue) { + if (hasValue()) { + storage_.value = newValue; + } else { + construct(newValue); + } + } + + Optional& operator=(None) noexcept { + reset(); + return *this; + } + + template + Optional& operator=(Arg&& arg) { + assign(std::forward(arg)); + return *this; + } + + Optional& operator=(Optional&& other) noexcept( + std::is_nothrow_move_assignable::value) { + assign(std::move(other)); + return *this; + } + + Optional& operator=(const Optional& other) noexcept( + std::is_nothrow_copy_assignable::value) { + assign(other); + return *this; + } + + template + Value& emplace(Args&&... args) { + clear(); + construct(std::forward(args)...); + return value(); + } + + template + typename std::enable_if< + std::is_constructible&, Args&&...>::value, + Value&>::type + emplace(std::initializer_list ilist, Args&&... args) { + clear(); + construct(ilist, std::forward(args)...); + return value(); + } + + void reset() noexcept { + storage_.clear(); + } + + void clear() noexcept { + reset(); + } + + void swap(Optional& that) noexcept(IsNothrowSwappable::value) { + if (hasValue() && that.hasValue()) { + using std::swap; + swap(value(), that.value()); + } else if (hasValue()) { + that.emplace(std::move(value())); + reset(); + } else if (that.hasValue()) { + emplace(std::move(that.value())); + that.reset(); + } + } + + const Value& value() const& { + require_value(); + return storage_.value; + } + + Value& value() & { + require_value(); + return storage_.value; + } + + Value&& value() && { + require_value(); + return std::move(storage_.value); + } + + const Value&& value() const&& { + require_value(); + return std::move(storage_.value); + } + + const Value* get_pointer() const& { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() & { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() && = delete; + + bool has_value() const noexcept { + return storage_.hasValue; + } + + bool hasValue() const noexcept { + return has_value(); + } + + explicit operator bool() const noexcept { + return has_value(); + } + + const Value& operator*() const& { + return value(); + } + Value& operator*() & { + return value(); + } + const Value&& operator*() const&& { + return std::move(value()); + } + Value&& operator*() && { + return std::move(value()); + } + + const Value* operator->() const { + return &value(); + } + Value* operator->() { + return &value(); + } + + // Return a copy of the value if set, or a given default if not. 
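// (Editorial illustration, not part of the original patch - hypothetical
// usage of value_or, which is defined just below:
//   folly::Optional<int> opt;
//   int x = opt.value_or(42);  // opt is empty, so x == 42
//   opt = 7;
//   int y = opt.value_or(42);  // opt is engaged, so y == 7
// )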
+ template + Value value_or(U&& dflt) const& { + if (storage_.hasValue) { + return storage_.value; + } + + return std::forward(dflt); + } + + template + Value value_or(U&& dflt) && { + if (storage_.hasValue) { + return std::move(storage_.value); + } + + return std::forward(dflt); + } + + private: + template + friend Optional<_t>> make_optional(T&&); + template + friend Optional make_optional(Args&&... args); + template + friend Optional make_optional(std::initializer_list, As&&...); + + /** + * Construct the optional in place, this is duplicated as a non-explicit + * constructor to allow returning values that are non-movable from + * make_optional using list initialization. + * + * Until C++17, at which point this will become unnecessary because of + * specified prvalue elision. + */ + struct PrivateConstructor { + explicit PrivateConstructor() = default; + }; + template + Optional(PrivateConstructor, Args&&... args) noexcept( + std::is_constructible::value) { + construct(std::forward(args)...); + } + + void require_value() const { + if (!storage_.hasValue) { + throw OptionalEmptyException{}; + } + } + + template + void construct(Args&&... args) { + const void* ptr = &storage_.value; + // For supporting const types. + new (const_cast(ptr)) Value(std::forward(args)...); + storage_.hasValue = true; + } + + struct StorageTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageTriviallyDestructible() + : emptyState('\0'), hasValue{false} {} + void clear() { + hasValue = false; + } + }; + + struct StorageNonTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageNonTriviallyDestructible() : hasValue{false} {} + ~StorageNonTriviallyDestructible() { + clear(); + } + + void clear() { + if (hasValue) { + hasValue = false; + value.~Value(); + } + } + }; + + using Storage = typename std::conditional< + std::is_trivially_destructible::value, + StorageTriviallyDestructible, + StorageNonTriviallyDestructible>::type; + + Storage storage_; +}; + +template +const T* get_pointer(const Optional& opt) { + return opt.get_pointer(); +} + +template +T* get_pointer(Optional& opt) { + return opt.get_pointer(); +} + +template +void swap(Optional& a, Optional& b) noexcept(noexcept(a.swap(b))) { + a.swap(b); +} + +template +Optional<_t>> make_optional(T&& v) { + using PrivateConstructor = + typename folly::Optional<_t>>::PrivateConstructor; + return {PrivateConstructor{}, std::forward(v)}; +} + +template +folly::Optional make_optional(Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, std::forward(args)...}; +} + +template +folly::Optional make_optional( + std::initializer_list il, + Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, il, std::forward(args)...}; +} + +/////////////////////////////////////////////////////////////////////////////// +// Comparisons. 
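// (Editorial illustration, not part of the original patch - the comparison
// operators defined below behave like this:
//   folly::Optional<int> a = 1;
//   folly::Optional<int> b;        // empty
//   assert(a != b);                // engagement differs
//   assert(b == folly::none);      // empty compares equal to none
//   assert(b < a);                 // empty orders before engaged
// )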
+ +template +bool operator==(const Optional& a, const V& b) { + return a.hasValue() && a.value() == b; +} + +template +bool operator!=(const Optional& a, const V& b) { + return !(a == b); +} + +template +bool operator==(const U& a, const Optional& b) { + return b.hasValue() && b.value() == a; +} + +template +bool operator!=(const U& a, const Optional& b) { + return !(a == b); +} + +template +bool operator==(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return false; + } + if (a.hasValue()) { + return a.value() == b.value(); + } + return true; +} + +template +bool operator!=(const Optional& a, const Optional& b) { + return !(a == b); +} + +template +bool operator<(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return a.hasValue() < b.hasValue(); + } + if (a.hasValue()) { + return a.value() < b.value(); + } + return false; +} + +template +bool operator>(const Optional& a, const Optional& b) { + return b < a; +} + +template +bool operator<=(const Optional& a, const Optional& b) { + return !(b < a); +} + +template +bool operator>=(const Optional& a, const Optional& b) { + return !(a < b); +} + +// Suppress comparability of Optional with T, despite implicit conversion. +template +bool operator<(const Optional&, const V& other) = delete; +template +bool operator<=(const Optional&, const V& other) = delete; +template +bool operator>=(const Optional&, const V& other) = delete; +template +bool operator>(const Optional&, const V& other) = delete; +template +bool operator<(const V& other, const Optional&) = delete; +template +bool operator<=(const V& other, const Optional&) = delete; +template +bool operator>=(const V& other, const Optional&) = delete; +template +bool operator>(const V& other, const Optional&) = delete; + +// Comparisons with none +template +bool operator==(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator==(None, const Optional& a) noexcept { + return !a.hasValue(); +} +template +bool operator<(const Optional&, None) noexcept { + return false; +} +template +bool operator<(None, const Optional& a) noexcept { + return a.hasValue(); +} +template +bool operator>(const Optional& a, None) noexcept { + return a.hasValue(); +} +template +bool operator>(None, const Optional&) noexcept { + return false; +} +template +bool operator<=(None, const Optional&) noexcept { + return true; +} +template +bool operator<=(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator>=(const Optional&, None) noexcept { + return true; +} +template +bool operator>=(None, const Optional& a) noexcept { + return !a.hasValue(); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace folly diff --git a/third-party/folly/folly/Portability.h b/third-party/folly/folly/Portability.h new file mode 100644 index 00000000000..2c6544c1961 --- /dev/null +++ b/third-party/folly/folly/Portability.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#if defined(__arm__) +#define FOLLY_ARM 1 +#else +#define FOLLY_ARM 0 +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define FOLLY_X64 1 +#else +#define FOLLY_X64 0 +#endif + +#if defined(__aarch64__) +#define FOLLY_AARCH64 1 +#else +#define FOLLY_AARCH64 0 +#endif + +#if defined(__powerpc64__) +#define FOLLY_PPC64 1 +#else +#define FOLLY_PPC64 0 +#endif + +#if defined(__has_builtin) +#define FOLLY_HAS_BUILTIN(...) __has_builtin(__VA_ARGS__) +#else +#define FOLLY_HAS_BUILTIN(...) 0 +#endif + +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(nodiscard) +#define FOLLY_NODISCARD [[nodiscard]] +#endif +#endif +#if !defined FOLLY_NODISCARD +#if defined(_MSC_VER) && (_MSC_VER >= 1700) +#define FOLLY_NODISCARD _Check_return_ +#elif defined(__GNUC__) +#define FOLLY_NODISCARD __attribute__((__warn_unused_result__)) +#else +#define FOLLY_NODISCARD +#endif +#endif + +namespace folly { +constexpr bool kIsArchArm = FOLLY_ARM == 1; +constexpr bool kIsArchAmd64 = FOLLY_X64 == 1; +constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1; +constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1; +} // namespace folly + +namespace folly { +#ifdef NDEBUG +constexpr auto kIsDebug = false; +#else +constexpr auto kIsDebug = true; +#endif +} // namespace folly + +namespace folly { +#if defined(_MSC_VER) +constexpr bool kIsMsvc = true; +#else +constexpr bool kIsMsvc = false; +#endif +} // namespace folly diff --git a/third-party/folly/folly/ScopeGuard.h b/third-party/folly/folly/ScopeGuard.h new file mode 100644 index 00000000000..71134406303 --- /dev/null +++ b/third-party/folly/folly/ScopeGuard.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +namespace folly { +namespace scope_guard_detail { +template +class ScopeGuardImpl { + public: + explicit ScopeGuardImpl(F&& f) : f_{std::forward(f)} {} + ~ScopeGuardImpl() { + f_(); + } + + private: + F f_; +}; + +enum class ScopeGuardEnum {}; +template >> +ScopeGuardImpl operator+(ScopeGuardEnum, Func&& func) { + return ScopeGuardImpl{std::forward(func)}; +} +} // namespace scope_guard_detail +} // namespace folly + +/** + * FB_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#ifndef FB_ANONYMOUS_VARIABLE +#define FB_CONCATENATE_IMPL(s1, s2) s1##s2 +#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define FB_ANONYMOUS_VARIABLE(str) \ + FB_CONCATENATE(FB_CONCATENATE(FB_CONCATENATE(str, __COUNTER__), _), __LINE__) +#else +#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__) +#endif +#endif + +#ifndef SCOPE_EXIT +#define SCOPE_EXIT \ + auto FB_ANONYMOUS_VARIABLE(SCOPE_EXIT_STATE) = \ + ::folly::scope_guard_detail::ScopeGuardEnum{} + [&]() noexcept +#endif diff --git a/third-party/folly/folly/Traits.h b/third-party/folly/folly/Traits.h new file mode 100644 index 00000000000..ea7e1eb1c05 --- /dev/null +++ b/third-party/folly/folly/Traits.h @@ -0,0 +1,152 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#pragma once
+
+#include
+#include
+
+namespace folly {
+
+#if !defined(_MSC_VER)
+template <typename T>
+struct is_trivially_copyable
+    : std::integral_constant<bool, __has_trivial_copy(T)> {};
+#else
+template <typename T>
+using is_trivially_copyable = std::is_trivially_copyable<T>;
+#endif
+
+/***
+ * _t
+ *
+ * Instead of:
+ *
+ *   using decayed = typename std::decay<T>::type;
+ *
+ * With the C++14 standard trait aliases, we could use:
+ *
+ *   using decayed = std::decay_t<T>;
+ *
+ * Without them, we could use:
+ *
+ *   using decayed = _t<std::decay<T>>;
+ *
+ * Also useful for any other library with template types having dependent
+ * member types named `type`, like the standard trait types.
+ */
+template <typename T>
+using _t = typename T::type;
+
+/**
+ * type_t
+ *
+ * A type alias for the first template type argument. `type_t` is useful for
+ * controlling class-template and function-template partial specialization.
+ *
+ * Example:
+ *
+ *   template <typename Value>
+ *   class Container {
+ *    public:
+ *     template <typename... Args>
+ *     Container(
+ *         type_t<in_place_t, decltype(Value(std::declval<Args>()...))>,
+ *         Args&&...);
+ *   };
+ *
+ * void_t
+ *
+ * A type alias for `void`. `void_t` is useful for controlling class-template
+ * and function-template partial specialization.
+ *
+ * Example:
+ *
+ *   // has_value_type<T>::value is true if T has a nested type `value_type`
+ *   template <typename T, typename = void>
+ *   struct has_value_type
+ *       : std::false_type {};
+ *
+ *   template <typename T>
+ *   struct has_value_type<T, folly::void_t<typename T::value_type>>
+ *       : std::true_type {};
+ */
+
+/**
+ * There is a bug in libstdc++, libc++, and MSVC's STL that causes it to
+ * ignore unused template parameter arguments in template aliases and does not
+ * cause substitution failures. This defect has been recorded here:
+ * http://open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#1558.
+ *
+ * This causes the implementation of std::void_t to be buggy, as it is likely
+ * defined as something like the following:
+ *
+ *   template <typename... Ts>
+ *   using void_t = void;
+ *
+ * This causes the compiler to ignore all the template arguments and does not
+ * help when one wants to cause substitution failures. Rather declarations
+ * which have void_t in orthogonal specializations are treated as the same.
+ * For example, assuming the possible `T` types are only allowed to have
+ * either the alias `one` or `two` and never both or none:
+ *
+ *   template <typename T, void_t<typename T::one>* = nullptr>
+ *   void foo(T&&) {}
+ *   template <typename T, void_t<typename T::two>* = nullptr>
+ *   void foo(T&&) {}
+ *
+ * The second foo() will be a redefinition because it conflicts with the first
+ * one; void_t does not cause substitution failures - the template types are
+ * just ignored.
+ */ + +namespace traits_detail { +template +struct type_t_ { + using type = T; +}; +} // namespace traits_detail + +template +using type_t = typename traits_detail::type_t_::type; +template +using void_t = type_t; + +/** + * A type trait to remove all const volatile and reference qualifiers on a + * type T + */ +template +struct remove_cvref { + using type = + typename std::remove_cv::type>::type; +}; +template +using remove_cvref_t = typename remove_cvref::type; + +template +struct IsNothrowSwappable + : std::integral_constant< + bool, + std::is_nothrow_move_constructible::value&& noexcept( + std::swap(std::declval(), std::declval()))> {}; + +template +struct Conjunction : std::true_type {}; +template +struct Conjunction : T {}; +template +struct Conjunction + : std::conditional, T>::type {}; + +template +struct Negation : std::integral_constant {}; + +template +using index_constant = std::integral_constant; + +} // namespace folly diff --git a/third-party/folly/folly/Unit.h b/third-party/folly/folly/Unit.h new file mode 100644 index 00000000000..c8cb77e2c37 --- /dev/null +++ b/third-party/folly/folly/Unit.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +namespace folly { + +/// In functional programming, the degenerate case is often called "unit". In +/// C++, "void" is often the best analogue. However, because of the syntactic +/// special-casing required for void, it is frequently a liability for template +/// metaprogramming. So, instead of writing specializations to handle cases like +/// SomeContainer, a library author may instead rule that out and simply +/// have library users use SomeContainer. Contained values may be ignored. +/// Much easier. +/// +/// "void" is the type that admits of no values at all. It is not possible to +/// construct a value of this type. +/// "unit" is the type that admits of precisely one unique value. It is +/// possible to construct a value of this type, but it is always the same value +/// every time, so it is uninteresting. +struct Unit { + constexpr bool operator==(const Unit& /*other*/) const { + return true; + } + constexpr bool operator!=(const Unit& /*other*/) const { + return false; + } +}; + +constexpr Unit unit{}; + +template +struct lift_unit { + using type = T; +}; +template <> +struct lift_unit { + using type = Unit; +}; +template +using lift_unit_t = typename lift_unit::type; + +template +struct drop_unit { + using type = T; +}; +template <> +struct drop_unit { + using type = void; +}; +template +using drop_unit_t = typename drop_unit::type; + +} // namespace folly + diff --git a/third-party/folly/folly/Utility.h b/third-party/folly/folly/Utility.h new file mode 100644 index 00000000000..7e43bdc2f17 --- /dev/null +++ b/third-party/folly/folly/Utility.h @@ -0,0 +1,141 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include + +namespace folly { + +/** + * Backports from C++17 of: + * std::in_place_t + * std::in_place_type_t + * std::in_place_index_t + * std::in_place + * std::in_place_type + * std::in_place_index + */ + +struct in_place_tag {}; +template +struct in_place_type_tag {}; +template +struct in_place_index_tag {}; + +using in_place_t = in_place_tag (&)(in_place_tag); +template +using in_place_type_t = in_place_type_tag (&)(in_place_type_tag); +template +using in_place_index_t = in_place_index_tag (&)(in_place_index_tag); + +inline in_place_tag in_place(in_place_tag = {}) { + return {}; +} +template +inline in_place_type_tag in_place_type(in_place_type_tag = {}) { + return {}; +} +template +inline in_place_index_tag in_place_index(in_place_index_tag = {}) { + return {}; +} + +template +T exchange(T& obj, U&& new_value) { + T old_value = std::move(obj); + obj = std::forward(new_value); + return old_value; +} + +namespace utility_detail { +template +struct make_seq_cat; +template < + template class S, + typename T, + T... Ta, + T... Tb, + T... Tc> +struct make_seq_cat, S, S> { + using type = + S; +}; + +// Not parameterizing by `template class, typename` because +// clang precisely v4.0 fails to compile that. Note that clang v3.9 and v5.0 +// handle that code correctly. +// +// For this to work, `S0` is required to be `Sequence` and `S1` is required +// to be `Sequence`. + +template +struct make_seq { + template + using apply = typename make_seq_cat< + typename make_seq::template apply, + typename make_seq::template apply, + typename make_seq::template apply>::type; +}; +template <> +struct make_seq<1> { + template + using apply = S1; +}; +template <> +struct make_seq<0> { + template + using apply = S0; +}; +} // namespace utility_detail + +// TODO: Remove after upgrading to C++14 baseline + +template +struct integer_sequence { + using value_type = T; + + static constexpr std::size_t size() noexcept { + return sizeof...(Ints); + } +}; + +template +using index_sequence = integer_sequence; + +template +using make_integer_sequence = typename utility_detail::make_seq< + Size>::template apply, integer_sequence>; + +template +using make_index_sequence = make_integer_sequence; +template +using index_sequence_for = make_index_sequence; + +/** + * A simple helper for getting a constant reference to an object. + * + * Example: + * + * std::vector v{1,2,3}; + * // The following two lines are equivalent: + * auto a = const_cast&>(v).begin(); + * auto b = folly::as_const(v).begin(); + * + * Like C++17's std::as_const. See http://wg21.link/p0007 + */ +template +T const& as_const(T& t) noexcept { + return t; +} + +template +void as_const(T const&&) = delete; + +} // namespace folly diff --git a/third-party/folly/folly/chrono/Hardware.h b/third-party/folly/folly/chrono/Hardware.h new file mode 100644 index 00000000000..ec7be82e8be --- /dev/null +++ b/third-party/folly/folly/chrono/Hardware.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include +#include + +#if _MSC_VER +extern "C" std::uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif + +namespace folly { + +inline std::uint64_t hardware_timestamp() { +#if _MSC_VER + return __rdtsc(); +#elif __GNUC__ && (__i386__ || FOLLY_X64) + return __builtin_ia32_rdtsc(); +#else + // use steady_clock::now() as an approximation for the timestamp counter on + // non-x86 systems + return std::chrono::steady_clock::now().time_since_epoch().count(); +#endif +} + +} // namespace folly + diff --git a/third-party/folly/folly/container/Array.h b/third-party/folly/folly/container/Array.h new file mode 100644 index 00000000000..bb3167b9793 --- /dev/null +++ b/third-party/folly/folly/container/Array.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include +#include + +namespace folly { + +namespace array_detail { +template +struct is_ref_wrapper : std::false_type {}; +template +struct is_ref_wrapper> : std::true_type {}; + +template +using not_ref_wrapper = + folly::Negation::type>>; + +template +struct return_type_helper { + using type = D; +}; +template +struct return_type_helper { + static_assert( + folly::Conjunction...>::value, + "TList cannot contain reference_wrappers when D is void"); + using type = typename std::common_type::type; +}; + +template +using return_type = std:: + array::type, sizeof...(TList)>; +} // namespace array_detail + +template +constexpr array_detail::return_type make_array(TList&&... t) { + using value_type = + typename array_detail::return_type_helper::type; + return {{static_cast(std::forward(t))...}}; +} + +namespace array_detail { +template +inline constexpr auto make_array_with( + MakeItem const& make, + folly::index_sequence) + -> std::array { + return std::array{{make(Index)...}}; +} +} // namespace array_detail + +// make_array_with +// +// Constructs a std::array<..., Size> with elements m(i) for i in [0, Size). +template +constexpr auto make_array_with(MakeItem const& make) + -> decltype(array_detail::make_array_with( + make, + folly::make_index_sequence{})) { + return array_detail::make_array_with( + make, + folly::make_index_sequence{}); +} + +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex-inl.h b/third-party/folly/folly/detail/Futex-inl.h new file mode 100644 index 00000000000..3b2a412bfb6 --- /dev/null +++ b/third-party/folly/folly/detail/Futex-inl.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +namespace folly { +namespace detail { + +/** Optimal when TargetClock is the same type as Clock. + * + * Otherwise, both Clock::now() and TargetClock::now() must be invoked. 
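 * (Editorial illustration, not part of the original patch: in the cross-clock
 * case, the deadline is rebased by taking the remaining duration and
 * re-anchoring it on the target clock, i.e. roughly
 *   TargetClock::now() + duration_cast<TargetDuration>(deadline - Clock::now())
 * which matches the final else-branch of this function.)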
*/ +template +typename TargetClock::time_point time_point_conv( + std::chrono::time_point const& time) { + using std::chrono::duration_cast; + using TimePoint = std::chrono::time_point; + using TargetDuration = typename TargetClock::duration; + using TargetTimePoint = typename TargetClock::time_point; + if (time == TimePoint::max()) { + return TargetTimePoint::max(); + } else if (std::is_same::value) { + // in place of time_point_cast, which cannot compile without if-constexpr + auto const delta = time.time_since_epoch(); + return TargetTimePoint(duration_cast(delta)); + } else { + // different clocks with different epochs, so non-optimal case + auto const delta = time - Clock::now(); + return TargetClock::now() + duration_cast(delta); + } +} + +/** + * Available overloads, with definitions elsewhere + * + * These functions are treated as ADL-extension points, the templates above + * call these functions without them having being pre-declared. This works + * because ADL lookup finds the definitions of these functions when you pass + * the relevant arguments + */ +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, nullptr, &deadline, waitMask); +} + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, &deadline, nullptr, waitMask); +} + +template +FutexResult +futexWait(const Futex* futex, uint32_t expected, uint32_t waitMask) { + auto rv = futexWaitImpl(futex, expected, nullptr, nullptr, waitMask); + assert(rv != FutexResult::TIMEDOUT); + return rv; +} + +template +int futexWake(const Futex* futex, int count, uint32_t wakeMask) { + return futexWakeImpl(futex, count, wakeMask); +} + +template +FutexResult futexWaitUntil( + const Futex* futex, + uint32_t expected, + std::chrono::time_point const& deadline, + uint32_t waitMask) { + using Target = typename std::conditional< + Clock::is_steady, + std::chrono::steady_clock, + std::chrono::system_clock>::type; + auto const converted = time_point_conv(deadline); + return converted == Target::time_point::max() + ? futexWaitImpl(futex, expected, nullptr, nullptr, waitMask) + : futexWaitImpl(futex, expected, converted, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.cpp b/third-party/folly/folly/detail/Futex.cpp new file mode 100644 index 00000000000..208578a901d --- /dev/null +++ b/third-party/folly/folly/detail/Futex.cpp @@ -0,0 +1,263 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __linux__ +#include +#endif + +#ifndef _WIN32 +#include +#endif + +using namespace std::chrono; + +namespace folly { +namespace detail { + +namespace { + +//////////////////////////////////////////////////// +// native implementation using the futex() syscall + +#ifdef __linux__ + +/// Certain toolchains (like Android's) don't include the full futex API in +/// their headers even though they support it. Make sure we have our constants +/// even if the headers don't have them. +#ifndef FUTEX_WAIT_BITSET +#define FUTEX_WAIT_BITSET 9 +#endif +#ifndef FUTEX_WAKE_BITSET +#define FUTEX_WAKE_BITSET 10 +#endif +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif +#ifndef FUTEX_CLOCK_REALTIME +#define FUTEX_CLOCK_REALTIME 256 +#endif + +int nativeFutexWake(const void* addr, int count, uint32_t wakeMask) { + int rv = syscall( + __NR_futex, + addr, /* addr1 */ + FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, /* op */ + count, /* val */ + nullptr, /* timeout */ + nullptr, /* addr2 */ + wakeMask); /* val3 */ + + /* NOTE: we ignore errors on wake for the case of a futex + guarding its own destruction, similar to this + glibc bug with sem_post/sem_wait: + https://sourceware.org/bugzilla/show_bug.cgi?id=12674 */ + if (rv < 0) { + return 0; + } + return rv; +} + +template +struct timespec timeSpecFromTimePoint(time_point absTime) { + auto epoch = absTime.time_since_epoch(); + if (epoch.count() < 0) { + // kernel timespec_valid requires non-negative seconds and nanos in [0,1G) + epoch = Clock::duration::zero(); + } + + // timespec-safe seconds and nanoseconds; + // chrono::{nano,}seconds are `long long int` + // whereas timespec uses smaller types + using time_t_seconds = duration; + using long_nanos = duration; + + auto secs = duration_cast(epoch); + auto nanos = duration_cast(epoch - secs); + struct timespec result = {secs.count(), nanos.count()}; + return result; +} + +FutexResult nativeFutexWaitImpl( + const void* addr, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + assert(absSystemTime == nullptr || absSteadyTime == nullptr); + + int op = FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG; + struct timespec ts; + struct timespec* timeout = nullptr; + + if (absSystemTime != nullptr) { + op |= FUTEX_CLOCK_REALTIME; + ts = timeSpecFromTimePoint(*absSystemTime); + timeout = &ts; + } else if (absSteadyTime != nullptr) { + ts = timeSpecFromTimePoint(*absSteadyTime); + timeout = &ts; + } + + // Unlike FUTEX_WAIT, FUTEX_WAIT_BITSET requires an absolute timeout + // value - http://locklessinc.com/articles/futex_cheat_sheet/ + int rv = syscall( + __NR_futex, + addr, /* addr1 */ + op, /* op */ + expected, /* val */ + timeout, /* timeout */ + nullptr, /* addr2 */ + waitMask); /* val3 */ + + if (rv == 0) { + return FutexResult::AWOKEN; + } else { + switch (errno) { + case ETIMEDOUT: + assert(timeout != nullptr); + return FutexResult::TIMEDOUT; + case EINTR: + return FutexResult::INTERRUPTED; + case EWOULDBLOCK: + return FutexResult::VALUE_CHANGED; + default: + assert(false); + // EINVAL, EACCESS, or EFAULT. EINVAL means there was an invalid + // op (should be impossible) or an invalid timeout (should have + // been sanitized by timeSpecFromTimePoint). EACCESS or EFAULT + // means *addr points to invalid memory, which is unlikely because + // the caller should have segfaulted already. 
We can either + // crash, or return a value that lets the process continue for + // a bit. We choose the latter. VALUE_CHANGED probably turns the + // caller into a spin lock. + return FutexResult::VALUE_CHANGED; + } + } +} + +#endif // __linux__ + +/////////////////////////////////////////////////////// +// compatibility implementation using standard C++ API + +using Lot = ParkingLot; +Lot parkingLot; + +int emulatedFutexWake(const void* addr, int count, uint32_t waitMask) { + int woken = 0; + parkingLot.unpark(addr, [&](const uint32_t& mask) { + if ((mask & waitMask) == 0) { + return UnparkControl::RetainContinue; + } + assert(count > 0); + count--; + woken++; + return count > 0 ? UnparkControl::RemoveContinue + : UnparkControl::RemoveBreak; + }); + return woken; +} + +template +FutexResult emulatedFutexWaitImpl( + F* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + static_assert( + std::is_same>::value || + std::is_same>::value, + "Type F must be either Futex or Futex"); + ParkResult res; + if (absSystemTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSystemTime); + } else if (absSteadyTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSteadyTime); + } else { + res = parkingLot.park( + futex, waitMask, [&] { return *futex == expected; }, [] {}); + } + switch (res) { + case ParkResult::Skip: + return FutexResult::VALUE_CHANGED; + case ParkResult::Unpark: + return FutexResult::AWOKEN; + case ParkResult::Timeout: + return FutexResult::TIMEDOUT; + } + + return FutexResult::INTERRUPTED; +} + +} // namespace + +///////////////////////////////// +// Futex<> overloads + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { +#ifdef __linux__ + return nativeFutexWake(futex, count, wakeMask); +#else + return emulatedFutexWake(futex, count, wakeMask); +#endif +} + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { + return emulatedFutexWake(futex, count, wakeMask); +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { +#ifdef __linux__ + return nativeFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#else + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#endif +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.h b/third-party/folly/folly/detail/Futex.h new file mode 100644 index 00000000000..987a1b89574 --- /dev/null +++ b/third-party/folly/folly/detail/Futex.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace folly { +namespace detail { + +enum class FutexResult { + VALUE_CHANGED, /* futex value didn't match expected */ + AWOKEN, /* wakeup by matching futex wake, or spurious wakeup */ + INTERRUPTED, /* wakeup by interrupting signal */ + TIMEDOUT, /* wakeup by expiring deadline */ +}; + +/** + * Futex is an atomic 32 bit unsigned integer that provides access to the + * futex() syscall on that value. It is templated in such a way that it + * can interact properly with DeterministicSchedule testing. + * + * If you don't know how to use futex(), you probably shouldn't be using + * this class. Even if you do know how, you should have a good reason + * (and benchmarks to back you up). + * + * Because of the semantics of the futex syscall, the futex family of + * functions are available as free functions rather than member functions + */ +template
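[The patch text is truncated here.]

(Editor's note: as a hedged illustration of how the two headline primitives
from this patch are intended to be used - assuming the Futex free functions
shown above and the proxy-returning lock()/unlock() API that
folly::DistributedMutex documents upstream; the mask defaults and exact
signatures below are best-effort assumptions, not verbatim from the patch:)

#include <folly/detail/Futex.h>
#include <folly/synchronization/DistributedMutex.h>

#include <thread>
#include <utility>

int main() {
  // Futex: a 32-bit word one thread waits on until another thread wakes it.
  folly::detail::Futex<> word{0};
  std::thread waiter([&] {
    // Blocks while word == 0; returns VALUE_CHANGED immediately otherwise.
    folly::detail::futexWait(&word, /* expected */ 0, /* waitMask */ 0xffffffffu);
  });
  word.store(1);
  folly::detail::futexWake(&word, /* count */ 1, /* wakeMask */ 0xffffffffu);
  waiter.join();

  // DistributedMutex: lock() returns a proxy token that unlock() consumes,
  // instead of the usual void-returning lock()/unlock() pair.
  folly::DistributedMutex mutex;
  auto proxy = mutex.lock();
  // ... critical section ...
  mutex.unlock(std::move(proxy));
  return 0;
}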